-
Notifications
You must be signed in to change notification settings - Fork 3k
Parquet: Use new logical type annotations #891
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,16 +28,19 @@ | |
| import java.time.ZoneOffset; | ||
| import java.time.temporal.ChronoUnit; | ||
| import java.util.List; | ||
| import java.util.Optional; | ||
| import org.apache.iceberg.data.Record; | ||
| import org.apache.iceberg.parquet.ParquetTypeVisitor; | ||
| import org.apache.iceberg.parquet.ParquetValueWriter; | ||
| import org.apache.iceberg.parquet.ParquetValueWriters; | ||
| import org.apache.iceberg.parquet.ParquetValueWriters.PrimitiveWriter; | ||
| import org.apache.iceberg.parquet.ParquetValueWriters.StructWriter; | ||
| import org.apache.parquet.Preconditions; | ||
| import org.apache.parquet.column.ColumnDescriptor; | ||
| import org.apache.parquet.io.api.Binary; | ||
| import org.apache.parquet.schema.DecimalMetadata; | ||
| import org.apache.parquet.schema.GroupType; | ||
| import org.apache.parquet.schema.LogicalTypeAnnotation; | ||
| import org.apache.parquet.schema.LogicalTypeAnnotation.LogicalTypeAnnotationVisitor; | ||
| import org.apache.parquet.schema.MessageType; | ||
| import org.apache.parquet.schema.PrimitiveType; | ||
| import org.apache.parquet.schema.Type; | ||
|
|
@@ -116,44 +119,11 @@ public ParquetValueWriter<?> map(GroupType map, | |
| @Override | ||
| public ParquetValueWriter<?> primitive(PrimitiveType primitive) { | ||
| ColumnDescriptor desc = type.getColumnDescription(currentPath()); | ||
|
|
||
| if (primitive.getOriginalType() != null) { | ||
| switch (primitive.getOriginalType()) { | ||
| case ENUM: | ||
| case JSON: | ||
| case UTF8: | ||
| return ParquetValueWriters.strings(desc); | ||
| case INT_8: | ||
| case INT_16: | ||
| case INT_32: | ||
| return ParquetValueWriters.ints(desc); | ||
| case INT_64: | ||
| return ParquetValueWriters.longs(desc); | ||
| case DATE: | ||
| return new DateWriter(desc); | ||
| case TIME_MICROS: | ||
| return new TimeWriter(desc); | ||
| case TIMESTAMP_MICROS: | ||
| return new TimestamptzWriter(desc); | ||
| case DECIMAL: | ||
| DecimalMetadata decimal = primitive.getDecimalMetadata(); | ||
| switch (primitive.getPrimitiveTypeName()) { | ||
| case INT32: | ||
| return ParquetValueWriters.decimalAsInteger(desc, decimal.getPrecision(), decimal.getScale()); | ||
| case INT64: | ||
| return ParquetValueWriters.decimalAsLong(desc, decimal.getPrecision(), decimal.getScale()); | ||
| case BINARY: | ||
| case FIXED_LEN_BYTE_ARRAY: | ||
| return ParquetValueWriters.decimalAsFixed(desc, decimal.getPrecision(), decimal.getScale()); | ||
| default: | ||
| throw new UnsupportedOperationException( | ||
| "Unsupported base type for decimal: " + primitive.getPrimitiveTypeName()); | ||
| } | ||
| case BSON: | ||
| return ParquetValueWriters.byteBuffers(desc); | ||
| default: | ||
| throw new UnsupportedOperationException( | ||
| "Unsupported logical type: " + primitive.getOriginalType()); | ||
| LogicalTypeAnnotation logicalType = primitive.getLogicalTypeAnnotation(); | ||
| if (logicalType != null) { | ||
| Optional<PrimitiveWriter<?>> writer = logicalType.accept(new LogicalTypeWriterVisitor(desc)); | ||
| if (writer.isPresent()) { | ||
| return writer.get(); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -178,6 +148,83 @@ public ParquetValueWriter<?> primitive(PrimitiveType primitive) { | |
| } | ||
| } | ||
|
|
||
| private static class LogicalTypeWriterVisitor implements LogicalTypeAnnotationVisitor<PrimitiveWriter<?>> { | ||
| private final ColumnDescriptor desc; | ||
|
|
||
| private LogicalTypeWriterVisitor(ColumnDescriptor desc) { | ||
| this.desc = desc; | ||
| } | ||
|
|
||
| @Override | ||
| public Optional<PrimitiveWriter<?>> visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringType) { | ||
| return Optional.of(ParquetValueWriters.strings(desc)); | ||
| } | ||
|
|
||
| @Override | ||
| public Optional<PrimitiveWriter<?>> visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumType) { | ||
| return Optional.of(ParquetValueWriters.strings(desc)); | ||
| } | ||
|
|
||
| @Override | ||
| public Optional<PrimitiveWriter<?>> visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalType) { | ||
| switch (desc.getPrimitiveType().getPrimitiveTypeName()) { | ||
| case INT32: | ||
| return Optional.of(ParquetValueWriters.decimalAsInteger( | ||
| desc, decimalType.getPrecision(), decimalType.getScale())); | ||
| case INT64: | ||
| return Optional.of(ParquetValueWriters.decimalAsLong( | ||
| desc, decimalType.getPrecision(), decimalType.getScale())); | ||
| case BINARY: | ||
| case FIXED_LEN_BYTE_ARRAY: | ||
| return Optional.of(ParquetValueWriters.decimalAsFixed( | ||
| desc, decimalType.getPrecision(), decimalType.getScale())); | ||
| } | ||
| return Optional.empty(); | ||
| } | ||
|
|
||
| @Override | ||
| public Optional<PrimitiveWriter<?>> visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateType) { | ||
| return Optional.of(new DateWriter(desc)); | ||
| } | ||
|
|
||
| @Override | ||
| public Optional<PrimitiveWriter<?>> visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeType) { | ||
| return Optional.of(new TimeWriter(desc)); | ||
| } | ||
|
|
||
| @Override | ||
| public Optional<PrimitiveWriter<?>> visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampType) { | ||
| Preconditions.checkArgument(LogicalTypeAnnotation.TimeUnit.MICROS.equals(timestampType.getUnit()), | ||
| "Cannot write timestamp in %s, only MICROS is supported", timestampType.getUnit()); | ||
| if (timestampType.isAdjustedToUTC()) { | ||
| return Optional.of(new TimestamptzWriter(desc)); | ||
| } else { | ||
| return Optional.of(new TimestampWriter(desc)); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public Optional<PrimitiveWriter<?>> visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intType) { | ||
| Preconditions.checkArgument(intType.isSigned() || intType.getBitWidth() < 64, | ||
| "Cannot read uint64: not a supported Java type"); | ||
| if (intType.getBitWidth() < 64) { | ||
| return Optional.of(ParquetValueWriters.ints(desc)); | ||
| } else { | ||
| return Optional.of(ParquetValueWriters.longs(desc)); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public Optional<PrimitiveWriter<?>> visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In Parquet, LogicalTypeAnnotation.java, there is
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Methods that aren't implemented here use the superclass implementation, which returns |
||
| return Optional.of(ParquetValueWriters.strings(desc)); | ||
| } | ||
|
|
||
| @Override | ||
| public Optional<PrimitiveWriter<?>> visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonType) { | ||
| return Optional.of(ParquetValueWriters.byteBuffers(desc)); | ||
| } | ||
| } | ||
|
|
||
| private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); | ||
| private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate(); | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When writer.isPresent() == false, we falls back to the previous logic, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Correct.