-
Notifications
You must be signed in to change notification settings - Fork 3k
Core: Fix Partitions table for evolved partition specs #4560
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
aac0ba8
1f2eb63
da0dc4d
983ec0e
34bcae8
5129068
b974f0e
fb53467
3cfb65c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,6 +45,7 @@ | |
| import org.apache.spark.sql.types.StructType; | ||
| import org.junit.After; | ||
| import org.junit.Assert; | ||
| import org.junit.Assume; | ||
| import org.junit.Test; | ||
| import org.junit.runner.RunWith; | ||
| import org.junit.runners.Parameterized; | ||
|
|
@@ -57,6 +58,7 @@ | |
| import static org.apache.iceberg.MetadataTableType.ALL_ENTRIES; | ||
| import static org.apache.iceberg.MetadataTableType.ENTRIES; | ||
| import static org.apache.iceberg.MetadataTableType.FILES; | ||
| import static org.apache.iceberg.MetadataTableType.PARTITIONS; | ||
| import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; | ||
| import static org.apache.iceberg.TableProperties.FORMAT_VERSION; | ||
|
|
||
|
|
@@ -378,6 +380,171 @@ public void testEntriesMetadataTable() throws ParseException { | |
| tableType); | ||
| } | ||
| } | ||
@Test
public void testPartitionsTableAddRemoveFields() throws ParseException {
  // Exercises the `partitions` metadata table across spec evolution:
  // unpartitioned -> partition by (data) -> (data, category) -> (category).
  // Manifest merging is disabled so each INSERT keeps its own manifest and the
  // per-(spec, partition) rows asserted below stay distinct.
  sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg " +
      "TBLPROPERTIES ('commit.manifest-merge.enabled' 'false')", tableName);
  initTable();
  // two rows written under the initial, unpartitioned spec
  sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName);
  sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName);

  // verify the metadata tables while the current spec is still unpartitioned:
  // the table must have no "partition" column at all
  Dataset<Row> df = loadMetadataTable(PARTITIONS);
  Assert.assertTrue("Partition must be skipped", df.schema().getFieldIndex("partition").isEmpty());

  Table table = validationCatalog.loadTable(tableIdent);

  // Reviewer note (from PR discussion): the spec update could be done via Spark SQL
  // and the REFRESH skipped; the API + REFRESH form is kept to match existing tests.
  table.updateSpec()
      .addField("data")
      .commit();
  sql("REFRESH TABLE %s", tableName);
  sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName);
  sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName);

  // verify the metadata tables after adding the first partition column;
  // files written under the old unpartitioned spec surface as a null partition
  assertPartitions(
      ImmutableList.of(row(new Object[]{null}), row("d1"), row("d2")),
      "STRUCT<data:STRING>",
      PARTITIONS);

  table.updateSpec()
      .addField("category")
      .commit();
  sql("REFRESH TABLE %s", tableName);
  sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName);
  sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName);

  // verify the metadata tables after adding the second partition column;
  // older files report null for partition fields that did not exist when
  // they were written
  assertPartitions(ImmutableList.of(
      row(null, null),
      row("d1", null),
      row("d1", "c1"),
      row("d2", null),
      row("d2", "c2")),
      "STRUCT<data:STRING,category:STRING>",
      PARTITIONS);

  // verify the metadata tables after removing the first partition column
  table.updateSpec()
      .removeField("data")
      .commit();
  sql("REFRESH TABLE %s", tableName);
  sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName);
  sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName);

  // the removed "data" field stays in the unified partition struct type;
  // files written after the removal carry null for it
  assertPartitions(
      ImmutableList.of(
          row(null, null),
          row(null, "c1"),
          row(null, "c2"),
          row("d1", null),
          row("d1", "c1"),
          row("d2", null),
          row("d2", "c2")),
      "STRUCT<data:STRING,category:STRING>",
      PARTITIONS);
}
|
|
||
| @Test | ||
| public void testPartitionsTableRenameFields() throws ParseException { | ||
| sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg " + | ||
| "TBLPROPERTIES ('commit.manifest-merge.enabled' 'false')", tableName); | ||
| initTable(); | ||
|
|
||
| Table table = validationCatalog.loadTable(tableIdent); | ||
|
|
||
| table.updateSpec() | ||
| .addField("data") | ||
| .addField("category") | ||
| .commit(); | ||
| sql("REFRESH TABLE %s", tableName); | ||
| sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); | ||
| sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); | ||
|
|
||
| assertPartitions(ImmutableList.of( | ||
| row("d1", "c1"), | ||
| row("d2", "c2")), | ||
| "STRUCT<data:STRING,category:STRING>", | ||
| PARTITIONS); | ||
|
|
||
| table.updateSpec() | ||
| .renameField("category", "category_another_name") | ||
| .commit(); | ||
| sql("REFRESH TABLE %s", tableName); | ||
| sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName); | ||
| sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName); | ||
|
|
||
| assertPartitions( | ||
| ImmutableList.of( | ||
| row("d1", "c1"), | ||
| row("d2", "c2")), | ||
| "STRUCT<data:STRING,category_another_name:STRING>", | ||
| PARTITIONS); | ||
| } | ||
|
|
||
@Test
public void testPartitionsTableSwitchFields() throws Exception {
  // Exercises dropping a partition field and then re-adding a field with the
  // same source column ("data").
  // Re-added partition fields are currently not re-associated with the
  // original field: https://github.com/apache/iceberg/issues/4292
  // In V1, dropped partition fields show separately when the field is re-added.
  // In V2, a re-added field currently conflicts with its deleted form,
  // so this test only runs for format version 1.
  Assume.assumeTrue(formatVersion == 1);

  sql("CREATE TABLE %s (id bigint NOT NULL, category string, data string) USING iceberg", tableName);
  initTable();
  Table table = validationCatalog.loadTable(tableIdent);

  // verify the metadata tables after re-adding the first dropped column in the second location
  table.updateSpec()
      .addField("data")
      .addField("category")
      .commit();
  sql("REFRESH TABLE %s", tableName);

  sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName);
  sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName);

  // baseline: both partition fields populated for the two files written above
  assertPartitions(ImmutableList.of(
      row("d1", "c1"),
      row("d2", "c2")),
      "STRUCT<data:STRING,category:STRING>",
      PARTITIONS);

  table.updateSpec()
      .removeField("data")
      .commit();
  sql("REFRESH TABLE %s", tableName);

  sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName);
  sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName);

  // files written after the removal report null for the dropped "data" field
  assertPartitions(
      ImmutableList.of(
          row(null, "c1"),
          row(null, "c2"),
          row("d1", "c1"),
          row("d2", "c2")),
      "STRUCT<data:STRING,category:STRING>",
      PARTITIONS);

  table.updateSpec()
      .addField("data")
      .commit();
  sql("REFRESH TABLE %s", tableName);

  sql("INSERT INTO TABLE %s VALUES (1, 'c1', 'd1')", tableName);
  sql("INSERT INTO TABLE %s VALUES (2, 'c2', 'd2')", tableName);

  // in V1 the re-added "data" field gets a fresh struct column; the original
  // appears as "data_1000" in the unified partition struct (see the expected
  // type below), rather than being re-associated with the new field
  assertPartitions(
      ImmutableList.of(
          row(null, "c1", null),
          row(null, "c1", "d1"),
          row(null, "c2", null),
          row(null, "c2", "d2"),
          row("d1", "c1", null),
          row("d2", "c2", null)),
      "STRUCT<data_1000:STRING,category:STRING,data:STRING>",
      PARTITIONS);
}
|
|
||
| @Test | ||
| public void testMetadataTablesWithUnknownTransforms() { | ||
|
|
@@ -429,6 +596,7 @@ private void assertPartitions(List<Object[]> expectedPartitions, String expected | |
|
|
||
| DataType expectedType = spark.sessionState().sqlParser().parseDataType(expectedTypeAsString); | ||
| switch (tableType) { | ||
| case PARTITIONS: | ||
| case FILES: | ||
| case ALL_DATA_FILES: | ||
| DataType actualFilesType = df.schema().apply("partition").dataType(); | ||
|
|
@@ -447,6 +615,7 @@ private void assertPartitions(List<Object[]> expectedPartitions, String expected | |
| } | ||
|
|
||
| switch (tableType) { | ||
| case PARTITIONS: | ||
| case FILES: | ||
| case ALL_DATA_FILES: | ||
| List<Row> actualFilesPartitions = df.orderBy("partition") | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm a bit confused by this change. I believe the issue here is that StructType does not have a well-defined hashCode (implementations can do whatever they like), which is why we use the Wrapper — to make sure we have valid hashCode and equals semantics.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed to a map keyed by PartitionData (I feel it should have been done that way from the beginning).