Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 24 additions & 6 deletions core/src/main/java/org/apache/iceberg/MetricsUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,36 @@ public class MetricsUtil {
private MetricsUtil() {}

/**
* Copies a metrics object without lower and upper bounds for given fields.
* Copies a metrics object without value, NULL and NaN counts for given fields.
*
* @param excludedFieldIds field IDs for which the lower and upper bounds must be dropped
* @param excludedFieldIds field IDs for which the counts must be dropped
* @return a new metrics object without counts for given fields
*/
public static Metrics copyWithoutFieldCounts(Metrics metrics, Set<Integer> excludedFieldIds) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I considered creating a builder for copying metrics, but I am not sure it would be worth the extra complexity, and it is not clear whether there would be future use cases benefiting from it. For now, I simply added another util method.

return new Metrics(
metrics.recordCount(),
metrics.columnSizes(),
copyWithoutKeys(metrics.valueCounts(), excludedFieldIds),
copyWithoutKeys(metrics.nullValueCounts(), excludedFieldIds),
copyWithoutKeys(metrics.nanValueCounts(), excludedFieldIds),
metrics.lowerBounds(),
metrics.upperBounds());
}

/**
* Copies a metrics object without counts and bounds for given fields.
*
* @param excludedFieldIds field IDs for which the counts and bounds must be dropped
* @return a new metrics object without counts and bounds for given fields
*/
public static Metrics copyWithoutFieldBounds(Metrics metrics, Set<Integer> excludedFieldIds) {
public static Metrics copyWithoutFieldCountsAndBounds(
Metrics metrics, Set<Integer> excludedFieldIds) {
return new Metrics(
metrics.recordCount(),
metrics.columnSizes(),
metrics.valueCounts(),
metrics.nullValueCounts(),
metrics.nanValueCounts(),
copyWithoutKeys(metrics.valueCounts(), excludedFieldIds),
copyWithoutKeys(metrics.nullValueCounts(), excludedFieldIds),
copyWithoutKeys(metrics.nanValueCounts(), excludedFieldIds),
copyWithoutKeys(metrics.lowerBounds(), excludedFieldIds),
copyWithoutKeys(metrics.upperBounds(), excludedFieldIds));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
* records, consider using {@link SortingPositionOnlyDeleteWriter} instead.
*/
public class PositionDeleteWriter<T> implements FileWriter<PositionDelete<T>, DeleteWriteResult> {
private static final Set<Integer> SINGLE_REFERENCED_FILE_BOUNDS_ONLY =
private static final Set<Integer> FILE_AND_POS_FIELD_IDS =
ImmutableSet.of(DELETE_FILE_PATH.fieldId(), DELETE_FILE_POS.fieldId());

private final FileAppender<StructLike> appender;
Expand Down Expand Up @@ -121,9 +121,9 @@ public DeleteWriteResult result() {
private Metrics metrics() {
Metrics metrics = appender.metrics();
if (referencedDataFiles.size() > 1) {
return MetricsUtil.copyWithoutFieldBounds(metrics, SINGLE_REFERENCED_FILE_BOUNDS_ONLY);
return MetricsUtil.copyWithoutFieldCountsAndBounds(metrics, FILE_AND_POS_FIELD_IDS);
} else {
return metrics;
return MetricsUtil.copyWithoutFieldCounts(metrics, FILE_AND_POS_FIELD_IDS);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.CharSequenceSet;
import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.StructLikeSet;
Expand Down Expand Up @@ -232,14 +233,20 @@ public void testPositionDeleteWriter() throws IOException {
if (fileFormat == FileFormat.AVRO) {
Assert.assertNull(deleteFile.lowerBounds());
Assert.assertNull(deleteFile.upperBounds());
Assert.assertNull(deleteFile.columnSizes());
} else {
Assert.assertEquals(1, referencedDataFiles.size());
Assert.assertEquals(2, deleteFile.lowerBounds().size());
Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_PATH.fieldId()));
Assert.assertEquals(2, deleteFile.upperBounds().size());
Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_PATH.fieldId()));
Assert.assertEquals(2, deleteFile.columnSizes().size());
}

Assert.assertNull(deleteFile.valueCounts());
Assert.assertNull(deleteFile.nullValueCounts());
Assert.assertNull(deleteFile.nanValueCounts());

// verify the written delete file
GenericRecord deleteRecord = GenericRecord.create(DeleteSchemaUtil.pathPosSchema());
List<Record> expectedDeletes =
Expand Down Expand Up @@ -281,6 +288,34 @@ public void testPositionDeleteWriterWithRow() throws IOException {
DeleteFile deleteFile = result.first();
CharSequenceSet referencedDataFiles = result.second();

if (fileFormat == FileFormat.AVRO) {
Assert.assertNull(deleteFile.lowerBounds());
Assert.assertNull(deleteFile.upperBounds());
Assert.assertNull(deleteFile.columnSizes());
Assert.assertNull(deleteFile.valueCounts());
Assert.assertNull(deleteFile.nullValueCounts());
Assert.assertNull(deleteFile.nanValueCounts());
} else {
Assert.assertEquals(1, referencedDataFiles.size());
Assert.assertEquals(4, deleteFile.lowerBounds().size());
Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_PATH.fieldId()));
Assert.assertTrue(deleteFile.lowerBounds().containsKey(DELETE_FILE_POS.fieldId()));
for (Types.NestedField column : table.schema().columns()) {
Assert.assertTrue(deleteFile.lowerBounds().containsKey(column.fieldId()));
}
Assert.assertEquals(4, deleteFile.upperBounds().size());
Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_PATH.fieldId()));
Assert.assertTrue(deleteFile.upperBounds().containsKey(DELETE_FILE_POS.fieldId()));
for (Types.NestedField column : table.schema().columns()) {
Assert.assertTrue(deleteFile.upperBounds().containsKey(column.fieldId()));
}
// ORC also contains metrics for the deleted row struct, not just actual data fields
Assert.assertTrue(deleteFile.columnSizes().size() >= 4);
Assert.assertTrue(deleteFile.valueCounts().size() >= 2);
Assert.assertTrue(deleteFile.nullValueCounts().size() >= 2);
Assert.assertNull(deleteFile.nanValueCounts());
}

// verify the written delete file
GenericRecord deletedRow = GenericRecord.create(table.schema());
Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(table.schema());
Expand Down Expand Up @@ -336,6 +371,15 @@ public void testPositionDeleteWriterMultipleDataFiles() throws IOException {
Assert.assertEquals(2, referencedDataFiles.size());
Assert.assertNull(deleteFile.lowerBounds());
Assert.assertNull(deleteFile.upperBounds());
Assert.assertNull(deleteFile.valueCounts());
Assert.assertNull(deleteFile.nullValueCounts());
Assert.assertNull(deleteFile.nanValueCounts());

if (fileFormat == FileFormat.AVRO) {
Assert.assertNull(deleteFile.columnSizes());
} else {
Assert.assertEquals(2, deleteFile.columnSizes().size());
}

// commit the data and delete files
table
Expand Down