diff --git a/src/main/java/org/apache/parquet/format/LogicalTypes.java b/src/main/java/org/apache/parquet/format/LogicalTypes.java
new file mode 100644
index 000000000..7c63e41da
--- /dev/null
+++ b/src/main/java/org/apache/parquet/format/LogicalTypes.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.format;
+
+/**
+ * Convenience instances of logical type classes: prebuilt LogicalType union constants plus a DECIMAL factory, so callers need not construct the wrapped type objects themselves.
+ */
+public class LogicalTypes {
+  public static class TimeUnits { // TimeUnit union values shared by the TIME_* and TIMESTAMP_* constants below
+    public static final TimeUnit MILLIS = TimeUnit.MILLIS(new MilliSeconds()); // wraps the empty MilliSeconds struct
+    public static final TimeUnit MICROS = TimeUnit.MICROS(new MicroSeconds()); // wraps the empty MicroSeconds struct
+  }
+
+  public static LogicalType DECIMAL(int scale, int precision) { // NOTE: scale is the first argument and precision the second
+    return LogicalType.DECIMAL(new DecimalType(scale, precision)); // both values are forwarded to DecimalType in the same order
+  }
+
+  public static final LogicalType UTF8 = LogicalType.STRING(new StringType()); // named UTF8 here, but wraps the STRING union field
+  public static final LogicalType MAP = LogicalType.MAP(new MapType());
+  public static final LogicalType LIST = LogicalType.LIST(new ListType());
+  public static final LogicalType ENUM = LogicalType.ENUM(new EnumType());
+  public static final LogicalType DATE = LogicalType.DATE(new DateType());
+  public static final LogicalType TIME_MILLIS = LogicalType.TIME(new TimeType(true, TimeUnits.MILLIS)); // the boolean is presumably isAdjustedToUTC (field 1 of TimeType in parquet.thrift) - TODO confirm against the generated constructor
+  public static final LogicalType TIME_MICROS = LogicalType.TIME(new TimeType(true, TimeUnits.MICROS));
+  public static final LogicalType TIMESTAMP_MILLIS = LogicalType.TIMESTAMP(new TimestampType(true, TimeUnits.MILLIS)); // same argument layout as TimeType, presumably (isAdjustedToUTC, unit) - TODO confirm
+  public static final LogicalType TIMESTAMP_MICROS = LogicalType.TIMESTAMP(new TimestampType(true, TimeUnits.MICROS));
+  public static final LogicalType INT_8 = LogicalType.INTEGER(new IntType((byte) 8, true)); // IntType arguments presumably map to (bitWidth, isSigned) per the Thrift field order - TODO confirm; INT_* pass true
+  public static final LogicalType INT_16 = LogicalType.INTEGER(new IntType((byte) 16, true));
+  public static final LogicalType INT_32 = LogicalType.INTEGER(new IntType((byte) 32, true));
+  public static final LogicalType INT_64 = LogicalType.INTEGER(new IntType((byte) 64, true));
+  public static final LogicalType UINT_8 = LogicalType.INTEGER(new IntType((byte) 8, false)); // UINT_* use the same bit widths as INT_* but pass false
+  public static final LogicalType UINT_16 = LogicalType.INTEGER(new IntType((byte) 16, false));
+  public static final LogicalType UINT_32 = LogicalType.INTEGER(new IntType((byte) 32, false));
+  public static final LogicalType UINT_64 = LogicalType.INTEGER(new IntType((byte) 64, false));
+  public static final LogicalType UNKNOWN = LogicalType.UNKNOWN(new NullType()); // the UNKNOWN union field wraps NullType
+  public static final LogicalType JSON = LogicalType.JSON(new JsonType());
+  public static final LogicalType BSON = LogicalType.BSON(new BsonType());
+}
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index d881c740b..4c76cbd97 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -174,13 +174,6 @@ enum ConvertedType {
    * particular timezone or date.
    */
   INTERVAL = 21;
-
-  /**
-   * Annotates a column that is always null
-   * Sometimes when discovering the schema of existing data
-   * values are always null
-   */
-  NULL = 25;
 }
 
 /**
@@ -231,6 +224,114 @@ struct Statistics {
   6: optional binary min_value;
 }
 
+/** Empty structs to use as logical type annotations */
+struct StringType {}  // allowed for BINARY, must be encoded with UTF-8
+struct MapType {}     // see LogicalTypes.md
+struct ListType {}    // see LogicalTypes.md
+struct EnumType {}    // allowed for BINARY, must be encoded with UTF-8
+struct DateType {}    // allowed for INT32
+
+/**
+ * Logical type to annotate a column that is always null.
+ *
+ * Sometimes when discovering the schema of existing data, values are always
+ * null and the physical type can't be determined. This annotation signals
+ * the case where the physical type was guessed from all null values.
+ */
+struct NullType {}    // allowed for any physical type, only null values stored
+
+/**
+ * Decimal logical type annotation
+ *
+ * To maintain forward-compatibility in v1, implementations using this logical
+ * type must also set scale and precision on the annotated SchemaElement.
+ *
+ * Allowed for physical types: INT32, INT64, FIXED, and BINARY (NOTE(review): FIXED presumably means FIXED_LEN_BYTE_ARRAY - confirm)
+ */
+struct DecimalType {
+  1: required i32 scale
+  2: required i32 precision
+}
+
+/** Time units for logical types; each unit is an empty marker struct selected through the TimeUnit union */
+struct MilliSeconds {}
+struct MicroSeconds {}
+union TimeUnit {
+  1: MilliSeconds MILLIS
+  2: MicroSeconds MICROS
+}
+
+/**
+ * Timestamp logical type annotation
+ *
+ * Allowed for physical types: INT64
+ */
+struct TimestampType {
+  1: required bool isAdjustedToUTC
+  2: required TimeUnit unit
+}
+
+/**
+ * Time logical type annotation
+ *
+ * Allowed for physical types: INT32 (millis), INT64 (micros)
+ */
+struct TimeType {
+  1: required bool isAdjustedToUTC
+  2: required TimeUnit unit
+}
+
+/**
+ * Integer logical type annotation
+ *
+ * bitWidth must be 8, 16, 32, or 64.
+ *
+ * Allowed for physical types: INT32, INT64
+ */
+struct IntType {
+  1: required byte bitWidth
+  2: required bool isSigned
+}
+
+/**
+ * Embedded JSON logical type annotation
+ *
+ * Allowed for physical types: BINARY
+ */
+struct JsonType {
+}
+
+/**
+ * Embedded BSON logical type annotation
+ *
+ * Allowed for physical types: BINARY
+ */
+struct BsonType {
+}
+
+/**
+ * LogicalType annotations to replace ConvertedType.
+ *
+ * To maintain compatibility, implementations using LogicalType for a
+ * SchemaElement must also set the corresponding ConvertedType from the
+ * following table (the trailing comment on each union member names it).
+ */
+union LogicalType {
+  1:  StringType STRING       // use ConvertedType UTF8 if encoding is UTF-8
+  2:  MapType MAP             // use ConvertedType MAP
+  3:  ListType LIST           // use ConvertedType LIST
+  4:  EnumType ENUM           // use ConvertedType ENUM
+  5:  DecimalType DECIMAL     // use ConvertedType DECIMAL
+  6:  DateType DATE           // use ConvertedType DATE
+  7:  TimeType TIME           // use ConvertedType TIME_MICROS or TIME_MILLIS
+  8:  TimestampType TIMESTAMP // use ConvertedType TIMESTAMP_MICROS or TIMESTAMP_MILLIS
+  // 9: reserved for INTERVAL (field id intentionally skipped; do not reuse it)
+  10: IntType INTEGER         // use ConvertedType INT_* or UINT_*
+  11: NullType UNKNOWN        // no compatible ConvertedType (this change removes ConvertedType NULL = 25)
+  12: JsonType JSON           // use ConvertedType JSON
+  13: BsonType BSON           // use ConvertedType BSON
+}
+
 /**
  * Represents a element inside a schema definition.
  *  - if it is a group (inner node) then type is undefined and num_children is defined
@@ -278,6 +379,13 @@ struct SchemaElement {
    */
   9: optional i32 field_id;
 
+  /**
+   * The logical type of this SchemaElement; only valid for primitives.
+   * NOTE(review): the LogicalType union also contains MAP and LIST - confirm whether "only valid for primitives" is meant to cover them.
+   * LogicalType replaces ConvertedType, but ConvertedType is still required
+   * for some logical types to ensure forward-compatibility in format v1.
+   */
+  10: optional LogicalType logicalType
 }
 
 /**