-
Notifications
You must be signed in to change notification settings - Fork 462
PARQUET-906: Add LogicalType annotation. #51
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
86a22b4
993102e
8203b21
190bd8a
c0386e9
02f3868
7cc29f7
ad8e91d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,55 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.parquet.format; | ||
|
|
||
| /** | ||
| * Convenience instances of logical type classes. | ||
| */ | ||
| public class LogicalTypes { | ||
| public static class TimeUnits { | ||
| public static final TimeUnit MILLIS = TimeUnit.MILLIS(new MilliSeconds()); | ||
| public static final TimeUnit MICROS = TimeUnit.MICROS(new MicroSeconds()); | ||
| } | ||
|
|
||
| public static LogicalType DECIMAL(int scale, int precision) { | ||
| return LogicalType.DECIMAL(new DecimalType(scale, precision)); | ||
| } | ||
|
|
||
| public static final LogicalType UTF8 = LogicalType.STRING(new StringType()); | ||
| public static final LogicalType MAP = LogicalType.MAP(new MapType()); | ||
| public static final LogicalType LIST = LogicalType.LIST(new ListType()); | ||
| public static final LogicalType ENUM = LogicalType.ENUM(new EnumType()); | ||
| public static final LogicalType DATE = LogicalType.DATE(new DateType()); | ||
| public static final LogicalType TIME_MILLIS = LogicalType.TIME(new TimeType(true, TimeUnits.MILLIS)); | ||
| public static final LogicalType TIME_MICROS = LogicalType.TIME(new TimeType(true, TimeUnits.MICROS)); | ||
| public static final LogicalType TIMESTAMP_MILLIS = LogicalType.TIMESTAMP(new TimestampType(true, TimeUnits.MILLIS)); | ||
| public static final LogicalType TIMESTAMP_MICROS = LogicalType.TIMESTAMP(new TimestampType(true, TimeUnits.MICROS)); | ||
| public static final LogicalType INT_8 = LogicalType.INTEGER(new IntType((byte) 8, true)); | ||
| public static final LogicalType INT_16 = LogicalType.INTEGER(new IntType((byte) 16, true)); | ||
| public static final LogicalType INT_32 = LogicalType.INTEGER(new IntType((byte) 32, true)); | ||
| public static final LogicalType INT_64 = LogicalType.INTEGER(new IntType((byte) 64, true)); | ||
| public static final LogicalType UINT_8 = LogicalType.INTEGER(new IntType((byte) 8, false)); | ||
| public static final LogicalType UINT_16 = LogicalType.INTEGER(new IntType((byte) 16, false)); | ||
| public static final LogicalType UINT_32 = LogicalType.INTEGER(new IntType((byte) 32, false)); | ||
| public static final LogicalType UINT_64 = LogicalType.INTEGER(new IntType((byte) 64, false)); | ||
| public static final LogicalType UNKNOWN = LogicalType.UNKNOWN(new NullType()); | ||
| public static final LogicalType JSON = LogicalType.JSON(new JsonType()); | ||
| public static final LogicalType BSON = LogicalType.BSON(new BsonType()); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -174,13 +174,6 @@ enum ConvertedType { | |
| * particular timezone or date. | ||
| */ | ||
| INTERVAL = 21; | ||
|
|
||
| /** | ||
| * Annotates a column that is always null | ||
| * Sometimes when discovering the schema of existing data | ||
| * values are always null | ||
| */ | ||
| NULL = 25; | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -231,6 +224,113 @@ struct Statistics { | |
| 6: optional binary min_value; | ||
| } | ||
|
|
||
| /** Empty structs to use as logical type annotations */ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think think it would help the reader if the comments for each logical type mention the physical type(s) they can annotate. Currently one needs to look them up in the table below, and then look in LogicalTypes.md or ConvertedType to find what physical types can be annotated.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added. |
||
| struct StringType {} // allowed for BINARY, must be encoded with UTF-8 | ||
| struct MapType {} // see LogicalTypes.md | ||
| struct ListType {} // see LogicalTypes.md | ||
| struct EnumType {} // allowed for BINARY, must be encoded with UTF-8 | ||
| struct DateType {} // allowed for INT32 | ||
|
|
||
| /** | ||
| * Logical type to annotate a column that is always null. | ||
| * | ||
| * Sometimes when discovering the schema of existing data values are always | ||
|
||
| * null and the physical type is assumed. | ||
| */ | ||
| struct NullType {} // allowed for any physical type, only null values stored | ||
|
|
||
| /** | ||
| * Decimal logical type annotation | ||
| * | ||
| * To maintain forward-compatibility in v1, implementations using this logical | ||
| * type must also set scale and precision on the annotated SchemaElement. | ||
| * | ||
| * Allowed for physical types: INT32, INT64, FIXED, and BINARY | ||
| */ | ||
| struct DecimalType { | ||
| 1: required i32 scale | ||
| 2: required i32 precision | ||
| } | ||
|
|
||
| /** Time units for logical types */ | ||
| struct MilliSeconds {} | ||
| struct MicroSeconds {} | ||
| union TimeUnit { | ||
| 1: MilliSeconds MILLIS | ||
| 2: MicroSeconds MICROS | ||
| } | ||
|
|
||
| /** | ||
| * Timestamp logical type annotation | ||
| * | ||
| * Allowed for physical types: INT64 | ||
| */ | ||
| struct TimestampType { | ||
| 1: required bool isAdjustedToUTC | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about "withoutTimeZone" meaning the opposite of "isAdjustedToUTC"
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. isAdjustedToUTC is more clear. With or without time zone is language from the SQL spec, which is very difficult to understand and apply. Rather than relying on it, this captures a very specific piece of information: whether the timestamp was adjusted to UTC from its original offset (or would have been if the original offset is UTC).
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if isAdjusted to UTC is true, we need to record the original TZ. So we should add an optional String timezone field. (and then we don't need isAdjustedToUTC since it is implied by timezone != null ) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The model we discussed for Parquet is different from the Arrow spec.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This isn't accurate, see: https://github.com/apache/arrow/blob/master/format/Schema.fbs#L120. In Arrow, if the timestamp metadata does not have a time zone, it is time zone naive, not UTC. From naive timestamp values, we can choose later to localize to UTC (which is a no-op) or localize to some other time zone (which will adjust the values to the UTC values internally based on the tz database)
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I opened https://issues.apache.org/jira/browse/ARROW-1020 to clarify in the comments in Schema.fbs When timezone is set, the physical values are UTC timestamps regardless of the time zone of the writer or the client, so changing the timezone does not change the underlying integers.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we need to store the original zone if isAdjustedToUTC is true. This isn't done by other systems, and there is no guarantee that there is a single zone that the timestamps were converted from. This just indicates that whatever the source zone, the values have been normalized to the same one, UTC.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I linked the parquet and arrow jiras about timestamp types. There's a discution there about timestamp types and timezones:
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We spoke about this on the Arrow sync yesterday. In Arrow, we have 3 cases:
What is proposed here simplifies this to either The problem I see here is that if a data processing system runs the query: select hour(timestamp_field), count(*)
from my_parquet_data
group by 1For timestamps with time zone, if the time zone is known then the Or maybe the analytics system has some means to cast to a timestamp with the additional metadata. Either way there's some awkwardness.
I see this concern, but if there is no consistency about the origin of the data, why not store "UTC" as the storage time zone? The fact that others may preserve a local time zone need not complicate this use case. We can use the keyvalue metadata to preserve the time zone metadata in the Parquet file footer to at least maintain compatibility with Arrow, but it's not ideal.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We've resolved this discussion in the Parquet sync-ups. Parquet timestamps are always stored with respect to UTC and won't have a time zone. |
||
| 2: required TimeUnit unit | ||
| } | ||
|
|
||
| /** | ||
| * Time logical type annotation | ||
| * | ||
| * Allowed for physical types: INT32 (millis), INT64 (micros) | ||
| */ | ||
| struct TimeType { | ||
| 1: required bool isAdjustedToUTC | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do this actually apply to Time? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Applies to at least PostgreSQL
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, this is required by the SQL spec |
||
| 2: required TimeUnit unit | ||
| } | ||
|
|
||
| /** | ||
| * Integer logical type annotation | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this needed in addition to the physical types? Could it just be UnsignedIntType and leave out the bitWidth?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unless we want to support other widths in the future?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should talk about this in the sync-up today.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think I didn't make it to that sync. Was there some conclusion?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bit width is needed to signal that all of the values will fit in a particular width. Since the spec currently has 8 and 16 bit widths, we have to capture those possible values.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is consistent with the Arrow integer metadata https://github.com/apache/arrow/blob/master/format/Schema.fbs#L87 |
||
| * | ||
| * bitWidth must be 8, 16, 32, or 64. | ||
| * | ||
| * Allowed for physical types: INT32, INT64 | ||
| */ | ||
| struct IntType { | ||
| 1: required byte bitWidth | ||
| 2: required bool isSigned | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just signed?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm fine either way. |
||
| } | ||
|
|
||
| /** | ||
| * Embedded JSON logical type annotation | ||
| * | ||
| * Allowed for physical types: BINARY | ||
| */ | ||
| struct JsonType { | ||
| } | ||
|
|
||
| /** | ||
| * Embedded BSON logical type annotation | ||
| * | ||
| * Allowed for physical types: BINARY | ||
| */ | ||
| struct BsonType { | ||
| } | ||
|
|
||
| /** | ||
| * LogicalType annotations to replace ConvertedType. | ||
| * | ||
| * To maintain compatibility, implementations using LogicalType for a | ||
| * SchemaElement must also set the corresponding ConvertedType from the | ||
| * following table. | ||
| */ | ||
| union LogicalType { | ||
| 1: StringType STRING // use ConvertedType UTF8 if encoding is UTF-8 | ||
| 2: MapType MAP // use ConvertedType MAP | ||
| 3: ListType LIST // use ConvertedType LIST | ||
| 4: EnumType ENUM // use ConvertedType ENUM | ||
| 5: DecimalType DECIMAL // use ConvertedType DECIMAL | ||
| 6: DateType DATE // use ConvertedType DATE | ||
| 7: TimeType TIME // use ConvertedType TIME_MICROS or TIME_MILLIS | ||
| 8: TimestampType TIMESTAMP // use ConvertedType TIMESTAMP_MICROS or TIMESTAMP_MILLIS | ||
| // 9: reserved for INTERVAL | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about creating an empty struct IntervalType and leaving a todo with that struct instead?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. but then we can not add required fields?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree with Julien. Then we can't add required fields to interval and there isn't much value to adding it now. |
||
| 10: IntType INTEGER // use ConvertedType INT_* or UINT_* | ||
| 11: NullType UNKNOWN // no compatible ConvertedType | ||
| 12: JsonType JSON // use ConvertedType JSON | ||
| 13: BsonType BSON // use ConvertedType BSON | ||
| } | ||
|
|
||
| /** | ||
| * Represents a element inside a schema definition. | ||
| * - if it is a group (inner node) then type is undefined and num_children is defined | ||
|
|
@@ -278,6 +378,13 @@ struct SchemaElement { | |
| */ | ||
| 9: optional i32 field_id; | ||
|
|
||
| /** | ||
| * The logical type of this SchemaElement; only valid for primitives. | ||
| * | ||
| * LogicalType replaces ConvertedType, but ConvertedType is still required | ||
| * for some logical types to ensure forward-compatibility in format v1. | ||
| */ | ||
| 10: optional LogicalType logicalType | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just noticed that this commit uses camelCase field names while the existing ones are underscore_separated. Was this an intentional naming convention change? In any case, it's too late to do anything about it as this has already been released as 2.4.0. |
||
| } | ||
|
|
||
| /** | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you don't want to duplicate the comment, maybe refer to NullType below
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops, I intended to remove NULL entirely because it is an unreleased type that can be replaced with the LogicalType version.