Parquet: Support constant map for partition values #909
The first file drops the imports that were only needed by the constant conversion (the deletion markers are inferred from the hunk's 14-to-11 line count and from the removed method below):

```diff
@@ -29,14 +29,11 @@
 import java.util.List;
 import java.util.Map;
 import java.util.UUID;
-import org.apache.avro.generic.GenericData;
 import org.apache.avro.io.Decoder;
-import org.apache.avro.util.Utf8;
 import org.apache.iceberg.avro.ValueReader;
 import org.apache.iceberg.avro.ValueReaders;
 import org.apache.iceberg.types.Type;
 import org.apache.iceberg.types.Types;
-import org.apache.iceberg.util.ByteBuffers;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
 import org.apache.spark.sql.catalyst.util.ArrayBasedMapData;
```
The same file removes `prepareConstant`, the per-type conversion of partition constants:

```diff
@@ -287,30 +284,5 @@ protected void set(InternalRow struct, int pos, Object value) {
         struct.setNullAt(pos);
       }
     }
-
-    @Override
-    protected Object prepareConstant(Type type, Object value) {
-      switch (type.typeId()) {
-        case DECIMAL:
-          return Decimal.apply((BigDecimal) value);
-        case STRING:
-          if (value instanceof Utf8) {
-            Utf8 utf8 = (Utf8) value;
-            return UTF8String.fromBytes(utf8.getBytes(), 0, utf8.getByteLength());
-          }
-          return UTF8String.fromString(value.toString());
-        case FIXED:
-          if (value instanceof byte[]) {
-            return value;
-          } else if (value instanceof GenericData.Fixed) {
-            return ((GenericData.Fixed) value).bytes();
-          }
-          return ByteBuffers.toByteArray((ByteBuffer) value);
-        case BINARY:
-          return ByteBuffers.toByteArray((ByteBuffer) value);
-        default:
-      }
-      return value;
-    }
   }
 }
```

**Author (Contributor)**, on the removed method: Moved into Spark.
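Since the diff only shows the deletion, here is a minimal sketch of what the Spark-side replacement could look like: the same conversion expressed as a static helper, so it can be passed around as a `BiFunction<Type, Object, Object>` callback. The `SparkConstants` class name is hypothetical; only the conversion logic comes from the removed method.

```java
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import org.apache.avro.generic.GenericData;
import org.apache.avro.util.Utf8;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.util.ByteBuffers;
import org.apache.spark.sql.types.Decimal;
import org.apache.spark.unsafe.types.UTF8String;

class SparkConstants {  // hypothetical holder class, for illustration only
  private SparkConstants() {
  }

  // Same logic as the removed prepareConstant: convert a partition constant
  // from its generic/Avro representation to Spark's internal representation.
  static Object convertConstant(Type type, Object value) {
    switch (type.typeId()) {
      case DECIMAL:
        return Decimal.apply((BigDecimal) value);
      case STRING:
        if (value instanceof Utf8) {
          Utf8 utf8 = (Utf8) value;
          return UTF8String.fromBytes(utf8.getBytes(), 0, utf8.getByteLength());
        }
        return UTF8String.fromString(value.toString());
      case FIXED:
        if (value instanceof byte[]) {
          return value;
        } else if (value instanceof GenericData.Fixed) {
          return ((GenericData.Fixed) value).bytes();
        }
        return ByteBuffers.toByteArray((ByteBuffer) value);
      case BINARY:
        return ByteBuffers.toByteArray((ByteBuffer) value);
      default:
        return value;
    }
  }
}
```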
The second file, the Spark test, picks up a new JUnit import for the ORC assumption used below:

```diff
@@ -41,6 +41,7 @@
 import org.apache.spark.sql.SparkSession;
 import org.junit.AfterClass;
 import org.junit.Assert;
+import org.junit.Assume;
 import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
```
The new test writes nested source data once as Avro, then rewrites it through Spark into a table partitioned by identity on each nested field in turn:

```diff
@@ -307,4 +308,72 @@ public void testPartitionValueTypes() throws Exception {
       TestTables.clearTables();
     }
   }
+
+  @Test
+  public void testNestedPartitionValues() throws Exception {
+    Assume.assumeTrue("ORC can't project nested partition values", !format.equalsIgnoreCase("orc"));
+
+    String[] columnNames = new String[] {
+        "b", "i", "l", "f", "d", "date", "ts", "s", "bytes", "dec_9_0", "dec_11_2", "dec_38_10"
+    };
+
+    HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
+    Schema nestedSchema = new Schema(optional(1, "nested", SUPPORTED_PRIMITIVES.asStruct()));
+
+    // create a table around the source data
+    String sourceLocation = temp.newFolder("source_table").toString();
+    Table source = tables.create(nestedSchema, sourceLocation);
+
+    // write out an Avro data file with all of the data types for source data
+    List<GenericData.Record> expected = RandomData.generateList(source.schema(), 2, 128735L);
+    File avroData = temp.newFile("data.avro");
+    Assert.assertTrue(avroData.delete());
+    try (FileAppender<GenericData.Record> appender = Avro.write(Files.localOutput(avroData))
+        .schema(source.schema())
+        .build()) {
+      appender.addAll(expected);
+    }
+
+    // add the Avro data file to the source table
+    source.newAppend()
+        .appendFile(DataFiles.fromInputFile(Files.localInput(avroData), 10))
+        .commit();
+
+    Dataset<Row> sourceDF = spark.read().format("iceberg").load(sourceLocation);
+
+    try {
+      for (String column : columnNames) {
+        String desc = "partition_by_" + SUPPORTED_PRIMITIVES.findType(column).toString();
+
+        File parent = temp.newFolder(desc);
+        File location = new File(parent, "test");
+        File dataFolder = new File(location, "data");
+        Assert.assertTrue("mkdirs should succeed", dataFolder.mkdirs());
+
+        PartitionSpec spec = PartitionSpec.builderFor(nestedSchema).identity("nested." + column).build();
+
+        Table table = tables.create(nestedSchema, spec, location.toString());
+        table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();
+
+        sourceDF.write()
+            .format("iceberg")
+            .mode("append")
+            .save(location.toString());
+
+        List<Row> actual = spark.read()
+            .format("iceberg")
+            .load(location.toString())
+            .collectAsList();
+
+        Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
+
+        for (int i = 0; i < expected.size(); i += 1) {
+          TestHelpers.assertEqualsSafe(
+              nestedSchema.asStruct(), expected.get(i), actual.get(i));
+        }
+      }
+    } finally {
+      TestTables.clearTables();
+    }
+  }
 }
```

**Reviewer (Contributor)**, on writing the source data as Avro: Why not write the data for the parameterized format for which the test is running?

**Author (Contributor):** This is just source data for the write from Spark with the target format. Since it isn't part of the test, we don't want it to change at all in ways that might affect the test.
From the review discussion: I'm moving this out of Avro and adding a constant-conversion callback to `PartitionUtil.constantsMap`. That way, Spark can supply a conversion function and use it in both places, instead of duplicating the conversion in the Avro and Parquet readers.
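To make the shape of that concrete, here is a hedged sketch of how such a callback might be wired in. A `PartitionUtil.constantsMap` overload taking a `BiFunction<Type, Object, Object>`, plus the `SparkConstants` helper from the sketch above, are assumptions about the resulting API, not quotes from this PR:

```java
import java.util.Map;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.util.PartitionUtil;

class ReaderSetup {  // hypothetical wiring, for illustration only
  // Build the field-id -> constant map once per scan task, applying the
  // Spark conversion callback, then share the resulting map with both the
  // Avro and Parquet value readers instead of converting in each one.
  static Map<Integer, ?> partitionConstants(FileScanTask task) {
    return PartitionUtil.constantsMap(task, SparkConstants::convertConstant);
  }
}
```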