From f31ccd3b2643ea5b3dbeaffa86cdeb3adbcd54eb Mon Sep 17 00:00:00 2001 From: Joshua Kolash Date: Fri, 4 Oct 2024 09:18:30 -0400 Subject: [PATCH 1/3] Demonstrate bug for issue #11253 Show that we do not bound the number of column statistics: if the nested struct had 200 columns, we would end up with over 200 statistics. --- .../apache/iceberg/io/TestWriterMetrics.java | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java b/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java index d1a782057006..afc021cdb724 100644 --- a/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java +++ b/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java @@ -20,11 +20,13 @@ import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.CharBuffer; +import java.util.Arrays; import java.util.List; import java.util.Map; import org.apache.iceberg.DataFile; @@ -241,6 +243,43 @@ public void testPositionDeleteMetricsCoveringMultipleDataFiles() throws IOExcept 3L, (long) Conversions.fromByteBuffer(Types.LongType.get(), upperBounds.get(5))); } + + @Test + public void testMaxColumnsBounded() throws IOException { + File tableDir = temp.newFolder(); + tableDir.delete(); // created by table create + + List fields = Arrays.asList(ID_FIELD, DATA_FIELD, STRUCT_FIELD); + + Schema maxColSchema = new Schema(fields); + + Table maxColumnTable = + TestTables.create( + tableDir, + "max_col_table", + maxColSchema, + PartitionSpec.unpartitioned(), + SortOrder.unsorted(), + FORMAT_V2); + + long maxInferredColumns = 3; + + maxColumnTable.updateProperties().set(TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS, String.valueOf(maxInferredColumns)).commit(); + + OutputFileFactory 
maxColFactory = + OutputFileFactory.builderFor(maxColumnTable, 1, 1).format(fileFormat).build(); + + T row = toRow(1,"data", false, Long.MAX_VALUE); + DataWriter dataWriter = + newWriterFactory(maxColumnTable) + .newDataWriter(maxColFactory.newOutputFile(), PartitionSpec.unpartitioned(), null); + dataWriter.write(row); + dataWriter.close(); + DataFile dataFile = dataWriter.toDataFile(); + assertThat(dataFile.upperBounds().keySet().size()).isEqualTo(maxInferredColumns); + + } + @Test public void testMaxColumns() throws IOException { File tableDir = temp.newFolder(); From 282c5b99ca68432f1602c139949ae6aeffc13810 Mon Sep 17 00:00:00 2001 From: Joshua Kolash Date: Fri, 4 Oct 2024 09:29:52 -0400 Subject: [PATCH 2/3] spotless apply --- .../apache/iceberg/io/TestWriterMetrics.java | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java b/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java index afc021cdb724..75fd4d09d0c5 100644 --- a/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java +++ b/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java @@ -243,7 +243,6 @@ public void testPositionDeleteMetricsCoveringMultipleDataFiles() throws IOExcept 3L, (long) Conversions.fromByteBuffer(Types.LongType.get(), upperBounds.get(5))); } - @Test public void testMaxColumnsBounded() throws IOException { File tableDir = temp.newFolder(); @@ -254,30 +253,34 @@ public void testMaxColumnsBounded() throws IOException { Schema maxColSchema = new Schema(fields); Table maxColumnTable = - TestTables.create( - tableDir, - "max_col_table", - maxColSchema, - PartitionSpec.unpartitioned(), - SortOrder.unsorted(), - FORMAT_V2); + TestTables.create( + tableDir, + "max_col_table", + maxColSchema, + PartitionSpec.unpartitioned(), + SortOrder.unsorted(), + FORMAT_V2); long maxInferredColumns = 3; - 
maxColumnTable.updateProperties().set(TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS, String.valueOf(maxInferredColumns)).commit(); + maxColumnTable + .updateProperties() + .set( + TableProperties.METRICS_MAX_INFERRED_COLUMN_DEFAULTS, + String.valueOf(maxInferredColumns)) + .commit(); OutputFileFactory maxColFactory = - OutputFileFactory.builderFor(maxColumnTable, 1, 1).format(fileFormat).build(); + OutputFileFactory.builderFor(maxColumnTable, 1, 1).format(fileFormat).build(); - T row = toRow(1,"data", false, Long.MAX_VALUE); + T row = toRow(1, "data", false, Long.MAX_VALUE); DataWriter dataWriter = - newWriterFactory(maxColumnTable) - .newDataWriter(maxColFactory.newOutputFile(), PartitionSpec.unpartitioned(), null); + newWriterFactory(maxColumnTable) + .newDataWriter(maxColFactory.newOutputFile(), PartitionSpec.unpartitioned(), null); dataWriter.write(row); dataWriter.close(); DataFile dataFile = dataWriter.toDataFile(); assertThat(dataFile.upperBounds().keySet().size()).isEqualTo(maxInferredColumns); - } @Test From b7b703bac568d90600b6b18394464fb2792b9839 Mon Sep 17 00:00:00 2001 From: Joshua Kolash Date: Fri, 4 Oct 2024 09:45:14 -0400 Subject: [PATCH 3/3] address merge conflicts --- .../test/java/org/apache/iceberg/io/TestWriterMetrics.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java b/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java index 7851b722cf0d..7846b1f639be 100644 --- a/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java +++ b/data/src/test/java/org/apache/iceberg/io/TestWriterMetrics.java @@ -243,8 +243,8 @@ public void testPositionDeleteMetricsCoveringMultipleDataFiles() throws IOExcept @TestTemplate public void testMaxColumnsBounded() throws IOException { - File tableDir = temp.newFolder(); - tableDir.delete(); // created by table create + File tableDir = Files.createTempDirectory(tempDir.toPath(), "table").toFile(); + 
assertThat(tableDir.delete()).isTrue(); List fields = Arrays.asList(ID_FIELD, DATA_FIELD, STRUCT_FIELD);