diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs index dcdfbcbe7b0..83ae04215b5 100644 --- a/rust/parquet/src/arrow/array_reader.rs +++ b/rust/parquet/src/arrow/array_reader.rs @@ -64,7 +64,7 @@ use crate::arrow::converter::{ }; use crate::arrow::record_reader::RecordReader; use crate::arrow::schema::parquet_to_arrow_field; -use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; +use crate::basic::{ConvertedType, Repetition, Type as PhysicalType}; use crate::column::page::PageIterator; use crate::column::reader::ColumnReaderImpl; use crate::data_type::{ @@ -1463,7 +1463,7 @@ impl<'a> ArrayReaderBuilder { )?)) } PhysicalType::BYTE_ARRAY => { - if cur_type.get_basic_info().logical_type() == LogicalType::UTF8 { + if cur_type.get_basic_info().converted_type() == ConvertedType::UTF8 { if let Some(ArrowType::LargeUtf8) = arrow_type { let converter = LargeUtf8Converter::new(LargeUtf8ArrayConverter {}); @@ -1514,7 +1514,8 @@ impl<'a> ArrayReaderBuilder { } } PhysicalType::FIXED_LEN_BYTE_ARRAY - if cur_type.get_basic_info().logical_type() == LogicalType::DECIMAL => + if cur_type.get_basic_info().converted_type() + == ConvertedType::DECIMAL => { let converter = DecimalConverter::new(DecimalArrayConverter::new( cur_type.get_precision(), @@ -1531,7 +1532,7 @@ impl<'a> ArrayReaderBuilder { )?)) } PhysicalType::FIXED_LEN_BYTE_ARRAY => { - if cur_type.get_basic_info().logical_type() == LogicalType::INTERVAL { + if cur_type.get_basic_info().converted_type() == ConvertedType::INTERVAL { let byte_width = match *cur_type { Type::PrimitiveType { ref type_length, .. @@ -1888,14 +1889,14 @@ mod tests { } macro_rules! test_primitive_array_reader_one_type { - ($arrow_parquet_type:ty, $physical_type:expr, $logical_type_str:expr, $result_arrow_type:ty, $result_arrow_cast_type:ty, $result_primitive_type:ty) => {{ + ($arrow_parquet_type:ty, $physical_type:expr, $converted_type_str:expr, $result_arrow_type:ty, $result_arrow_cast_type:ty, $result_primitive_type:ty) => {{ let message_type = format!( " message test_schema {{ REQUIRED {:?} leaf ({}); }} ", - $physical_type, $logical_type_str + $physical_type, $converted_type_str ); let schema = parse_message_type(&message_type) .map(|t| Arc::new(SchemaDescriptor::new(Arc::new(t)))) diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index fa973b5cc0e..84b7a4daf66 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -33,7 +33,7 @@ use crate::errors::{ParquetError::ArrowError, Result}; use crate::file::{metadata::KeyValue, properties::WriterProperties}; use crate::schema::types::{ColumnDescriptor, SchemaDescriptor, Type, TypePtr}; use crate::{ - basic::{LogicalType, Repetition, Type as PhysicalType}, + basic::{ConvertedType, Repetition, Type as PhysicalType}, errors::ParquetError, }; @@ -321,18 +321,18 @@ fn arrow_to_parquet_type(field: &Field) -> Result { // create type from field match field.data_type() { DataType::Null => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::NONE) + .with_converted_type(ConvertedType::NONE) .with_repetition(repetition) .build(), DataType::Boolean => Type::primitive_type_builder(name, PhysicalType::BOOLEAN) .with_repetition(repetition) .build(), DataType::Int8 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::INT_8) + .with_converted_type(ConvertedType::INT_8) .with_repetition(repetition) .build(), DataType::Int16 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::INT_16) + .with_converted_type(ConvertedType::INT_16) .with_repetition(repetition) .build(), DataType::Int32 => Type::primitive_type_builder(name, PhysicalType::INT32) @@ -342,19 +342,19 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .build(), DataType::UInt8 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::UINT_8) + .with_converted_type(ConvertedType::UINT_8) .with_repetition(repetition) .build(), DataType::UInt16 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::UINT_16) + .with_converted_type(ConvertedType::UINT_16) .with_repetition(repetition) .build(), DataType::UInt32 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::UINT_32) + .with_converted_type(ConvertedType::UINT_32) .with_repetition(repetition) .build(), DataType::UInt64 => Type::primitive_type_builder(name, PhysicalType::INT64) - .with_logical_type(LogicalType::UINT_64) + .with_converted_type(ConvertedType::UINT_64) .with_repetition(repetition) .build(), DataType::Float16 => Err(ArrowError("Float16 arrays not supported".to_string())), @@ -366,30 +366,30 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .build(), DataType::Timestamp(time_unit, _) => { Type::primitive_type_builder(name, PhysicalType::INT64) - .with_logical_type(match time_unit { - TimeUnit::Second => LogicalType::TIMESTAMP_MILLIS, - TimeUnit::Millisecond => LogicalType::TIMESTAMP_MILLIS, - TimeUnit::Microsecond => LogicalType::TIMESTAMP_MICROS, - TimeUnit::Nanosecond => LogicalType::TIMESTAMP_MICROS, + .with_converted_type(match time_unit { + TimeUnit::Second => ConvertedType::TIMESTAMP_MILLIS, + TimeUnit::Millisecond => ConvertedType::TIMESTAMP_MILLIS, + TimeUnit::Microsecond => ConvertedType::TIMESTAMP_MICROS, + TimeUnit::Nanosecond => ConvertedType::TIMESTAMP_MICROS, }) .with_repetition(repetition) .build() } DataType::Date32 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::DATE) + .with_converted_type(ConvertedType::DATE) .with_repetition(repetition) .build(), // date64 is cast to date32 DataType::Date64 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::DATE) + .with_converted_type(ConvertedType::DATE) .with_repetition(repetition) .build(), DataType::Time32(_) => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::TIME_MILLIS) + .with_converted_type(ConvertedType::TIME_MILLIS) .with_repetition(repetition) .build(), DataType::Time64(_) => Type::primitive_type_builder(name, PhysicalType::INT64) - .with_logical_type(LogicalType::TIME_MICROS) + .with_converted_type(ConvertedType::TIME_MICROS) .with_repetition(repetition) .build(), DataType::Duration(_) => Err(ArrowError( @@ -397,7 +397,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { )), DataType::Interval(_) => { Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) - .with_logical_type(LogicalType::INTERVAL) + .with_converted_type(ConvertedType::INTERVAL) .with_repetition(repetition) .with_length(12) .build() @@ -417,14 +417,14 @@ fn arrow_to_parquet_type(field: &Field) -> Result { Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(repetition) .with_length(decimal_length_from_precision(*precision) as i32) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(*precision as i32) .with_scale(*scale as i32) .build() } DataType::Utf8 | DataType::LargeUtf8 => { Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .with_repetition(repetition) .build() } @@ -436,7 +436,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(Repetition::REPEATED) .build()?, )]) - .with_logical_type(LogicalType::LIST) + .with_converted_type(ConvertedType::LIST) .with_repetition(repetition) .build() } @@ -583,17 +583,17 @@ impl ParquetTypeConverter<'_> { } fn from_int32(&self) -> Result { - match self.schema.get_basic_info().logical_type() { - LogicalType::NONE => Ok(DataType::Int32), - LogicalType::UINT_8 => Ok(DataType::UInt8), - LogicalType::UINT_16 => Ok(DataType::UInt16), - LogicalType::UINT_32 => Ok(DataType::UInt32), - LogicalType::INT_8 => Ok(DataType::Int8), - LogicalType::INT_16 => Ok(DataType::Int16), - LogicalType::INT_32 => Ok(DataType::Int32), - LogicalType::DATE => Ok(DataType::Date32), - LogicalType::TIME_MILLIS => Ok(DataType::Time32(TimeUnit::Millisecond)), - LogicalType::DECIMAL => Ok(self.to_decimal()), + match self.schema.get_basic_info().converted_type() { + ConvertedType::NONE => Ok(DataType::Int32), + ConvertedType::UINT_8 => Ok(DataType::UInt8), + ConvertedType::UINT_16 => Ok(DataType::UInt16), + ConvertedType::UINT_32 => Ok(DataType::UInt32), + ConvertedType::INT_8 => Ok(DataType::Int8), + ConvertedType::INT_16 => Ok(DataType::Int16), + ConvertedType::INT_32 => Ok(DataType::Int32), + ConvertedType::DATE => Ok(DataType::Date32), + ConvertedType::TIME_MILLIS => Ok(DataType::Time32(TimeUnit::Millisecond)), + ConvertedType::DECIMAL => Ok(self.to_decimal()), other => Err(ArrowError(format!( "Unable to convert parquet INT32 logical type {}", other @@ -602,18 +602,18 @@ impl ParquetTypeConverter<'_> { } fn from_int64(&self) -> Result { - match self.schema.get_basic_info().logical_type() { - LogicalType::NONE => Ok(DataType::Int64), - LogicalType::INT_64 => Ok(DataType::Int64), - LogicalType::UINT_64 => Ok(DataType::UInt64), - LogicalType::TIME_MICROS => Ok(DataType::Time64(TimeUnit::Microsecond)), - LogicalType::TIMESTAMP_MILLIS => { + match self.schema.get_basic_info().converted_type() { + ConvertedType::NONE => Ok(DataType::Int64), + ConvertedType::INT_64 => Ok(DataType::Int64), + ConvertedType::UINT_64 => Ok(DataType::UInt64), + ConvertedType::TIME_MICROS => Ok(DataType::Time64(TimeUnit::Microsecond)), + ConvertedType::TIMESTAMP_MILLIS => { Ok(DataType::Timestamp(TimeUnit::Millisecond, None)) } - LogicalType::TIMESTAMP_MICROS => { + ConvertedType::TIMESTAMP_MICROS => { Ok(DataType::Timestamp(TimeUnit::Microsecond, None)) } - LogicalType::DECIMAL => Ok(self.to_decimal()), + ConvertedType::DECIMAL => Ok(self.to_decimal()), other => Err(ArrowError(format!( "Unable to convert parquet INT64 logical type {}", other @@ -622,9 +622,9 @@ impl ParquetTypeConverter<'_> { } fn from_fixed_len_byte_array(&self) -> Result { - match self.schema.get_basic_info().logical_type() { - LogicalType::DECIMAL => Ok(self.to_decimal()), - LogicalType::INTERVAL => { + match self.schema.get_basic_info().converted_type() { + ConvertedType::DECIMAL => Ok(self.to_decimal()), + ConvertedType::INTERVAL => { // There is currently no reliable way of determining which IntervalUnit // to return. Thus without the original Arrow schema, the results // would be incorrect if all 12 bytes of the interval are populated @@ -656,9 +656,9 @@ impl ParquetTypeConverter<'_> { } fn from_byte_array(&self) -> Result { - match self.schema.get_basic_info().logical_type() { - LogicalType::NONE => Ok(DataType::Binary), - LogicalType::UTF8 => Ok(DataType::Utf8), + match self.schema.get_basic_info().converted_type() { + ConvertedType::NONE => Ok(DataType::Binary), + ConvertedType::UTF8 => Ok(DataType::Utf8), other => Err(ArrowError(format!( "Unable to convert parquet BYTE_ARRAY logical type {}", other @@ -683,8 +683,8 @@ impl ParquetTypeConverter<'_> { }) }) } else { - match self.schema.get_basic_info().logical_type() { - LogicalType::LIST => self.to_list(), + match self.schema.get_basic_info().converted_type() { + ConvertedType::LIST => self.to_list(), _ => self.to_struct(), } } diff --git a/rust/parquet/src/basic.rs b/rust/parquet/src/basic.rs index bf41d43da90..73142f75052 100644 --- a/rust/parquet/src/basic.rs +++ b/rust/parquet/src/basic.rs @@ -24,6 +24,12 @@ use parquet_format as parquet; use crate::errors::ParquetError; +// Re-export parquet_format types used in this module +pub use parquet_format::{ + BsonType, DateType, DecimalType, EnumType, IntType, JsonType, ListType, MapType, + NullType, StringType, TimeType, TimeUnit, TimestampType, UUIDType, +}; + // ---------------------------------------------------------------------- // Types from the Thrift definition @@ -50,11 +56,11 @@ pub enum Type { // ---------------------------------------------------------------------- // Mirrors `parquet::ConvertedType` -/// Common types (logical types) used by frameworks when using Parquet. +/// Common types (converted types) used by frameworks when using Parquet. /// This helps map between types in those frameworks to the base types in Parquet. /// This is only metadata and not needed to read or write the data. #[derive(Debug, Clone, Copy, PartialEq)] -pub enum LogicalType { +pub enum ConvertedType { NONE, /// A BYTE_ARRAY actually contains UTF8 encoded chars. UTF8, @@ -146,6 +152,27 @@ pub enum LogicalType { INTERVAL, } +// ---------------------------------------------------------------------- +// Mirrors `parquet::LogicalType` + +/// Logical types used by version 2 of the Parquet format. +#[derive(Debug, Clone, PartialEq)] +pub enum LogicalType { + STRING(StringType), + MAP(MapType), + LIST(ListType), + ENUM(EnumType), + DECIMAL(DecimalType), + DATE(DateType), + TIME(TimeType), + TIMESTAMP(TimestampType), + INTEGER(IntType), + UNKNOWN(NullType), + JSON(JsonType), + BSON(BsonType), + UUID(UUIDType), +} + // ---------------------------------------------------------------------- // Mirrors `parquet::FieldRepetitionType` @@ -284,42 +311,74 @@ pub enum ColumnOrder { impl ColumnOrder { /// Returns sort order for a physical/logical type. - pub fn get_sort_order(logical_type: LogicalType, physical_type: Type) -> SortOrder { + pub fn get_sort_order( + logical_type: Option, + converted_type: ConvertedType, + physical_type: Type, + ) -> SortOrder { + // TODO: Should this take converted and logical type, for compatibility? match logical_type { + Some(logical) => match logical { + LogicalType::STRING(_) + | LogicalType::ENUM(_) + | LogicalType::JSON(_) + | LogicalType::BSON(_) => SortOrder::UNSIGNED, + LogicalType::INTEGER(t) => match t.is_signed { + true => SortOrder::SIGNED, + false => SortOrder::UNSIGNED, + }, + LogicalType::MAP(_) | LogicalType::LIST(_) => SortOrder::UNDEFINED, + LogicalType::DECIMAL(_) => SortOrder::SIGNED, + LogicalType::DATE(_) => SortOrder::SIGNED, + LogicalType::TIME(_) => SortOrder::SIGNED, + LogicalType::TIMESTAMP(_) => SortOrder::SIGNED, + LogicalType::UNKNOWN(_) => SortOrder::UNDEFINED, + LogicalType::UUID(_) => SortOrder::UNSIGNED, + }, + // Fall back to converted type + None => Self::get_converted_sort_order(converted_type, physical_type), + } + } + + fn get_converted_sort_order( + converted_type: ConvertedType, + physical_type: Type, + ) -> SortOrder { + match converted_type { // Unsigned byte-wise comparison. - LogicalType::UTF8 - | LogicalType::JSON - | LogicalType::BSON - | LogicalType::ENUM => SortOrder::UNSIGNED, + ConvertedType::UTF8 + | ConvertedType::JSON + | ConvertedType::BSON + | ConvertedType::ENUM => SortOrder::UNSIGNED, - LogicalType::INT_8 - | LogicalType::INT_16 - | LogicalType::INT_32 - | LogicalType::INT_64 => SortOrder::SIGNED, + ConvertedType::INT_8 + | ConvertedType::INT_16 + | ConvertedType::INT_32 + | ConvertedType::INT_64 => SortOrder::SIGNED, - LogicalType::UINT_8 - | LogicalType::UINT_16 - | LogicalType::UINT_32 - | LogicalType::UINT_64 => SortOrder::UNSIGNED, + ConvertedType::UINT_8 + | ConvertedType::UINT_16 + | ConvertedType::UINT_32 + | ConvertedType::UINT_64 => SortOrder::UNSIGNED, // Signed comparison of the represented value. - LogicalType::DECIMAL => SortOrder::SIGNED, + ConvertedType::DECIMAL => SortOrder::SIGNED, - LogicalType::DATE => SortOrder::SIGNED, + ConvertedType::DATE => SortOrder::SIGNED, - LogicalType::TIME_MILLIS - | LogicalType::TIME_MICROS - | LogicalType::TIMESTAMP_MILLIS - | LogicalType::TIMESTAMP_MICROS => SortOrder::SIGNED, + ConvertedType::TIME_MILLIS + | ConvertedType::TIME_MICROS + | ConvertedType::TIMESTAMP_MILLIS + | ConvertedType::TIMESTAMP_MICROS => SortOrder::SIGNED, - LogicalType::INTERVAL => SortOrder::UNSIGNED, + ConvertedType::INTERVAL => SortOrder::UNDEFINED, - LogicalType::LIST | LogicalType::MAP | LogicalType::MAP_KEY_VALUE => { + ConvertedType::LIST | ConvertedType::MAP | ConvertedType::MAP_KEY_VALUE => { SortOrder::UNDEFINED } // Fall back to physical type. - LogicalType::NONE => Self::get_default_sort_order(physical_type), + ConvertedType::NONE => Self::get_default_sort_order(physical_type), } } @@ -337,7 +396,7 @@ impl ColumnOrder { // If the max is -0, the row group may contain +0 values as well. // When looking for NaN values, min and max should be ignored. Type::FLOAT | Type::DOUBLE => SortOrder::SIGNED, - // unsigned byte-wise comparison + // Unsigned byte-wise comparison Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => SortOrder::UNSIGNED, } } @@ -357,7 +416,7 @@ impl fmt::Display for Type { } } -impl fmt::Display for LogicalType { +impl fmt::Display for ConvertedType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{:?}", self) } @@ -433,70 +492,167 @@ impl convert::From for parquet::Type { } // ---------------------------------------------------------------------- -// parquet::ConvertedType <=> LogicalType conversion +// parquet::ConvertedType <=> ConvertedType conversion -impl convert::From> for LogicalType { +impl convert::From> for ConvertedType { fn from(option: Option) -> Self { match option { - None => LogicalType::NONE, + None => ConvertedType::NONE, Some(value) => match value { - parquet::ConvertedType::Utf8 => LogicalType::UTF8, - parquet::ConvertedType::Map => LogicalType::MAP, - parquet::ConvertedType::MapKeyValue => LogicalType::MAP_KEY_VALUE, - parquet::ConvertedType::List => LogicalType::LIST, - parquet::ConvertedType::Enum => LogicalType::ENUM, - parquet::ConvertedType::Decimal => LogicalType::DECIMAL, - parquet::ConvertedType::Date => LogicalType::DATE, - parquet::ConvertedType::TimeMillis => LogicalType::TIME_MILLIS, - parquet::ConvertedType::TimeMicros => LogicalType::TIME_MICROS, - parquet::ConvertedType::TimestampMillis => LogicalType::TIMESTAMP_MILLIS, - parquet::ConvertedType::TimestampMicros => LogicalType::TIMESTAMP_MICROS, - parquet::ConvertedType::Uint8 => LogicalType::UINT_8, - parquet::ConvertedType::Uint16 => LogicalType::UINT_16, - parquet::ConvertedType::Uint32 => LogicalType::UINT_32, - parquet::ConvertedType::Uint64 => LogicalType::UINT_64, - parquet::ConvertedType::Int8 => LogicalType::INT_8, - parquet::ConvertedType::Int16 => LogicalType::INT_16, - parquet::ConvertedType::Int32 => LogicalType::INT_32, - parquet::ConvertedType::Int64 => LogicalType::INT_64, - parquet::ConvertedType::Json => LogicalType::JSON, - parquet::ConvertedType::Bson => LogicalType::BSON, - parquet::ConvertedType::Interval => LogicalType::INTERVAL, + parquet::ConvertedType::Utf8 => ConvertedType::UTF8, + parquet::ConvertedType::Map => ConvertedType::MAP, + parquet::ConvertedType::MapKeyValue => ConvertedType::MAP_KEY_VALUE, + parquet::ConvertedType::List => ConvertedType::LIST, + parquet::ConvertedType::Enum => ConvertedType::ENUM, + parquet::ConvertedType::Decimal => ConvertedType::DECIMAL, + parquet::ConvertedType::Date => ConvertedType::DATE, + parquet::ConvertedType::TimeMillis => ConvertedType::TIME_MILLIS, + parquet::ConvertedType::TimeMicros => ConvertedType::TIME_MICROS, + parquet::ConvertedType::TimestampMillis => { + ConvertedType::TIMESTAMP_MILLIS + } + parquet::ConvertedType::TimestampMicros => { + ConvertedType::TIMESTAMP_MICROS + } + parquet::ConvertedType::Uint8 => ConvertedType::UINT_8, + parquet::ConvertedType::Uint16 => ConvertedType::UINT_16, + parquet::ConvertedType::Uint32 => ConvertedType::UINT_32, + parquet::ConvertedType::Uint64 => ConvertedType::UINT_64, + parquet::ConvertedType::Int8 => ConvertedType::INT_8, + parquet::ConvertedType::Int16 => ConvertedType::INT_16, + parquet::ConvertedType::Int32 => ConvertedType::INT_32, + parquet::ConvertedType::Int64 => ConvertedType::INT_64, + parquet::ConvertedType::Json => ConvertedType::JSON, + parquet::ConvertedType::Bson => ConvertedType::BSON, + parquet::ConvertedType::Interval => ConvertedType::INTERVAL, }, } } } -impl convert::From for Option { - fn from(value: LogicalType) -> Self { +impl convert::From for Option { + fn from(value: ConvertedType) -> Self { match value { - LogicalType::NONE => None, - LogicalType::UTF8 => Some(parquet::ConvertedType::Utf8), - LogicalType::MAP => Some(parquet::ConvertedType::Map), - LogicalType::MAP_KEY_VALUE => Some(parquet::ConvertedType::MapKeyValue), - LogicalType::LIST => Some(parquet::ConvertedType::List), - LogicalType::ENUM => Some(parquet::ConvertedType::Enum), - LogicalType::DECIMAL => Some(parquet::ConvertedType::Decimal), - LogicalType::DATE => Some(parquet::ConvertedType::Date), - LogicalType::TIME_MILLIS => Some(parquet::ConvertedType::TimeMillis), - LogicalType::TIME_MICROS => Some(parquet::ConvertedType::TimeMicros), - LogicalType::TIMESTAMP_MILLIS => { + ConvertedType::NONE => None, + ConvertedType::UTF8 => Some(parquet::ConvertedType::Utf8), + ConvertedType::MAP => Some(parquet::ConvertedType::Map), + ConvertedType::MAP_KEY_VALUE => Some(parquet::ConvertedType::MapKeyValue), + ConvertedType::LIST => Some(parquet::ConvertedType::List), + ConvertedType::ENUM => Some(parquet::ConvertedType::Enum), + ConvertedType::DECIMAL => Some(parquet::ConvertedType::Decimal), + ConvertedType::DATE => Some(parquet::ConvertedType::Date), + ConvertedType::TIME_MILLIS => Some(parquet::ConvertedType::TimeMillis), + ConvertedType::TIME_MICROS => Some(parquet::ConvertedType::TimeMicros), + ConvertedType::TIMESTAMP_MILLIS => { Some(parquet::ConvertedType::TimestampMillis) } - LogicalType::TIMESTAMP_MICROS => { + ConvertedType::TIMESTAMP_MICROS => { Some(parquet::ConvertedType::TimestampMicros) } - LogicalType::UINT_8 => Some(parquet::ConvertedType::Uint8), - LogicalType::UINT_16 => Some(parquet::ConvertedType::Uint16), - LogicalType::UINT_32 => Some(parquet::ConvertedType::Uint32), - LogicalType::UINT_64 => Some(parquet::ConvertedType::Uint64), - LogicalType::INT_8 => Some(parquet::ConvertedType::Int8), - LogicalType::INT_16 => Some(parquet::ConvertedType::Int16), - LogicalType::INT_32 => Some(parquet::ConvertedType::Int32), - LogicalType::INT_64 => Some(parquet::ConvertedType::Int64), - LogicalType::JSON => Some(parquet::ConvertedType::Json), - LogicalType::BSON => Some(parquet::ConvertedType::Bson), - LogicalType::INTERVAL => Some(parquet::ConvertedType::Interval), + ConvertedType::UINT_8 => Some(parquet::ConvertedType::Uint8), + ConvertedType::UINT_16 => Some(parquet::ConvertedType::Uint16), + ConvertedType::UINT_32 => Some(parquet::ConvertedType::Uint32), + ConvertedType::UINT_64 => Some(parquet::ConvertedType::Uint64), + ConvertedType::INT_8 => Some(parquet::ConvertedType::Int8), + ConvertedType::INT_16 => Some(parquet::ConvertedType::Int16), + ConvertedType::INT_32 => Some(parquet::ConvertedType::Int32), + ConvertedType::INT_64 => Some(parquet::ConvertedType::Int64), + ConvertedType::JSON => Some(parquet::ConvertedType::Json), + ConvertedType::BSON => Some(parquet::ConvertedType::Bson), + ConvertedType::INTERVAL => Some(parquet::ConvertedType::Interval), + } + } +} + +// ---------------------------------------------------------------------- +// parquet::LogicalType <=> LogicalType conversion + +impl convert::From for LogicalType { + fn from(value: parquet::LogicalType) -> Self { + match value { + parquet::LogicalType::STRING(t) => LogicalType::STRING(t), + parquet::LogicalType::MAP(t) => LogicalType::MAP(t), + parquet::LogicalType::LIST(t) => LogicalType::LIST(t), + parquet::LogicalType::ENUM(t) => LogicalType::ENUM(t), + parquet::LogicalType::DECIMAL(t) => LogicalType::DECIMAL(t), + parquet::LogicalType::DATE(t) => LogicalType::DATE(t), + parquet::LogicalType::TIME(t) => LogicalType::TIME(t), + parquet::LogicalType::TIMESTAMP(t) => LogicalType::TIMESTAMP(t), + parquet::LogicalType::INTEGER(t) => LogicalType::INTEGER(t), + parquet::LogicalType::UNKNOWN(t) => LogicalType::UNKNOWN(t), + parquet::LogicalType::JSON(t) => LogicalType::JSON(t), + parquet::LogicalType::BSON(t) => LogicalType::BSON(t), + parquet::LogicalType::UUID(t) => LogicalType::UUID(t), + } + } +} + +impl convert::From for parquet::LogicalType { + fn from(value: LogicalType) -> Self { + match value { + LogicalType::STRING(t) => parquet::LogicalType::STRING(t), + LogicalType::MAP(t) => parquet::LogicalType::MAP(t), + LogicalType::LIST(t) => parquet::LogicalType::LIST(t), + LogicalType::ENUM(t) => parquet::LogicalType::ENUM(t), + LogicalType::DECIMAL(t) => parquet::LogicalType::DECIMAL(t), + LogicalType::DATE(t) => parquet::LogicalType::DATE(t), + LogicalType::TIME(t) => parquet::LogicalType::TIME(t), + LogicalType::TIMESTAMP(t) => parquet::LogicalType::TIMESTAMP(t), + LogicalType::INTEGER(t) => parquet::LogicalType::INTEGER(t), + LogicalType::UNKNOWN(t) => parquet::LogicalType::UNKNOWN(t), + LogicalType::JSON(t) => parquet::LogicalType::JSON(t), + LogicalType::BSON(t) => parquet::LogicalType::BSON(t), + LogicalType::UUID(t) => parquet::LogicalType::UUID(t), + } + } +} + +// ---------------------------------------------------------------------- +// LogicalType <=> ConvertedType conversion + +// Note: To prevent type loss when converting from ConvertedType to LogicalType, +// the conversion from ConvertedType -> LogicalType is not implemented. +// Such type loss includes: +// - Not knowing the decimal scale and precision of ConvertedType +// - Time and timestamp nanosecond precision, that is not supported in ConvertedType. + +impl From> for ConvertedType { + fn from(value: Option) -> Self { + match value { + Some(value) => match value { + LogicalType::STRING(_) => ConvertedType::UTF8, + LogicalType::MAP(_) => ConvertedType::MAP, + LogicalType::LIST(_) => ConvertedType::LIST, + LogicalType::ENUM(_) => ConvertedType::ENUM, + LogicalType::DECIMAL(_) => ConvertedType::DECIMAL, + LogicalType::DATE(_) => ConvertedType::DATE, + LogicalType::TIME(t) => match t.unit { + TimeUnit::MILLIS(_) => ConvertedType::TIME_MILLIS, + TimeUnit::MICROS(_) => ConvertedType::TIME_MICROS, + TimeUnit::NANOS(_) => ConvertedType::NONE, + }, + LogicalType::TIMESTAMP(t) => match t.unit { + TimeUnit::MILLIS(_) => ConvertedType::TIMESTAMP_MILLIS, + TimeUnit::MICROS(_) => ConvertedType::TIMESTAMP_MICROS, + TimeUnit::NANOS(_) => ConvertedType::NONE, + }, + LogicalType::INTEGER(t) => match (t.bit_width, t.is_signed) { + (8, true) => ConvertedType::INT_8, + (16, true) => ConvertedType::INT_16, + (32, true) => ConvertedType::INT_32, + (64, true) => ConvertedType::INT_64, + (8, false) => ConvertedType::UINT_8, + (16, false) => ConvertedType::UINT_16, + (32, false) => ConvertedType::UINT_32, + (64, false) => ConvertedType::UINT_64, + t => panic!("Integer type {:?} is not supported", t), + }, + LogicalType::UNKNOWN(_) => ConvertedType::NONE, + LogicalType::JSON(_) => ConvertedType::JSON, + LogicalType::BSON(_) => ConvertedType::BSON, + LogicalType::UUID(_) => ConvertedType::NONE, + }, + None => ConvertedType::NONE, } } } @@ -647,34 +803,128 @@ impl str::FromStr for Type { } } +impl str::FromStr for ConvertedType { + type Err = ParquetError; + + fn from_str(s: &str) -> result::Result { + match s { + "NONE" => Ok(ConvertedType::NONE), + "UTF8" => Ok(ConvertedType::UTF8), + "MAP" => Ok(ConvertedType::MAP), + "MAP_KEY_VALUE" => Ok(ConvertedType::MAP_KEY_VALUE), + "LIST" => Ok(ConvertedType::LIST), + "ENUM" => Ok(ConvertedType::ENUM), + "DECIMAL" => Ok(ConvertedType::DECIMAL), + "DATE" => Ok(ConvertedType::DATE), + "TIME_MILLIS" => Ok(ConvertedType::TIME_MILLIS), + "TIME_MICROS" => Ok(ConvertedType::TIME_MICROS), + "TIMESTAMP_MILLIS" => Ok(ConvertedType::TIMESTAMP_MILLIS), + "TIMESTAMP_MICROS" => Ok(ConvertedType::TIMESTAMP_MICROS), + "UINT_8" => Ok(ConvertedType::UINT_8), + "UINT_16" => Ok(ConvertedType::UINT_16), + "UINT_32" => Ok(ConvertedType::UINT_32), + "UINT_64" => Ok(ConvertedType::UINT_64), + "INT_8" => Ok(ConvertedType::INT_8), + "INT_16" => Ok(ConvertedType::INT_16), + "INT_32" => Ok(ConvertedType::INT_32), + "INT_64" => Ok(ConvertedType::INT_64), + "JSON" => Ok(ConvertedType::JSON), + "BSON" => Ok(ConvertedType::BSON), + "INTERVAL" => Ok(ConvertedType::INTERVAL), + other => Err(general_err!("Invalid converted type {}", other)), + } + } +} + impl str::FromStr for LogicalType { type Err = ParquetError; fn from_str(s: &str) -> result::Result { match s { - "NONE" => Ok(LogicalType::NONE), - "UTF8" => Ok(LogicalType::UTF8), - "MAP" => Ok(LogicalType::MAP), - "MAP_KEY_VALUE" => Ok(LogicalType::MAP_KEY_VALUE), - "LIST" => Ok(LogicalType::LIST), - "ENUM" => Ok(LogicalType::ENUM), - "DECIMAL" => Ok(LogicalType::DECIMAL), - "DATE" => Ok(LogicalType::DATE), - "TIME_MILLIS" => Ok(LogicalType::TIME_MILLIS), - "TIME_MICROS" => Ok(LogicalType::TIME_MICROS), - "TIMESTAMP_MILLIS" => Ok(LogicalType::TIMESTAMP_MILLIS), - "TIMESTAMP_MICROS" => Ok(LogicalType::TIMESTAMP_MICROS), - "UINT_8" => Ok(LogicalType::UINT_8), - "UINT_16" => Ok(LogicalType::UINT_16), - "UINT_32" => Ok(LogicalType::UINT_32), - "UINT_64" => Ok(LogicalType::UINT_64), - "INT_8" => Ok(LogicalType::INT_8), - "INT_16" => Ok(LogicalType::INT_16), - "INT_32" => Ok(LogicalType::INT_32), - "INT_64" => Ok(LogicalType::INT_64), - "JSON" => Ok(LogicalType::JSON), - "BSON" => Ok(LogicalType::BSON), - "INTERVAL" => Ok(LogicalType::INTERVAL), + "INTEGER(8,true)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true, + })), + "INTEGER(16,true)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 16, + is_signed: true, + })), + "INTEGER(32,true)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 32, + is_signed: true, + })), + "INTEGER(64,true)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 64, + is_signed: true, + })), + "INTEGER(8,false)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: false, + })), + "INTEGER(16,false)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 16, + is_signed: false, + })), + "INTEGER(32,false)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 32, + is_signed: false, + })), + "INTEGER(64,false)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 64, + is_signed: false, + })), + "MAP" => Ok(LogicalType::MAP(MapType {})), + "LIST" => Ok(LogicalType::LIST(ListType {})), + "ENUM" => Ok(LogicalType::ENUM(EnumType {})), + // TODO: ARROW-11365 + // "DECIMAL" => Ok(LogicalType::DECIMAL), + "DATE" => Ok(LogicalType::DATE(DateType {})), + "TIME(MILLIS,true)" => Ok(LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::MILLIS(parquet::MilliSeconds {}), + })), + "TIME(MILLIS,false)" => Ok(LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MILLIS(parquet::MilliSeconds {}), + })), + "TIME(MICROS,true)" => Ok(LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::MICROS(parquet::MicroSeconds {}), + })), + "TIME(MICROS,false)" => Ok(LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MICROS(parquet::MicroSeconds {}), + })), + "TIMESTAMP(MILLIS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::MILLIS(parquet::MilliSeconds {}), + })), + "TIMESTAMP(MILLIS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MILLIS(parquet::MilliSeconds {}), + })), + "TIMESTAMP(MICROS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::MICROS(parquet::MicroSeconds {}), + })), + "TIMESTAMP(MICROS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MICROS(parquet::MicroSeconds {}), + })), + "TIMESTAMP(NANOS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::MICROS(parquet::MicroSeconds {}), + })), + "TIMESTAMP(NANOS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MICROS(parquet::MicroSeconds {}), + })), + "STRING" => Ok(LogicalType::STRING(StringType {})), + "JSON" => Ok(LogicalType::JSON(JsonType {})), + "BSON" => Ok(LogicalType::BSON(BsonType {})), + "UUID" => Ok(LogicalType::UUID(UUIDType {})), + "UNKNOWN" => Ok(LogicalType::UNKNOWN(NullType {})), + "INTERVAL" => Err(general_err!("Interval logical type not yet supported")), other => Err(general_err!("Invalid logical type {}", other)), } } @@ -770,364 +1020,533 @@ mod tests { } #[test] - fn test_display_logical_type() { - assert_eq!(LogicalType::NONE.to_string(), "NONE"); - assert_eq!(LogicalType::UTF8.to_string(), "UTF8"); - assert_eq!(LogicalType::MAP.to_string(), "MAP"); - assert_eq!(LogicalType::MAP_KEY_VALUE.to_string(), "MAP_KEY_VALUE"); - assert_eq!(LogicalType::LIST.to_string(), "LIST"); - assert_eq!(LogicalType::ENUM.to_string(), "ENUM"); - assert_eq!(LogicalType::DECIMAL.to_string(), "DECIMAL"); - assert_eq!(LogicalType::DATE.to_string(), "DATE"); - assert_eq!(LogicalType::TIME_MILLIS.to_string(), "TIME_MILLIS"); - assert_eq!(LogicalType::DATE.to_string(), "DATE"); - assert_eq!(LogicalType::TIME_MICROS.to_string(), "TIME_MICROS"); - assert_eq!( - LogicalType::TIMESTAMP_MILLIS.to_string(), + fn test_display_converted_type() { + assert_eq!(ConvertedType::NONE.to_string(), "NONE"); + assert_eq!(ConvertedType::UTF8.to_string(), "UTF8"); + assert_eq!(ConvertedType::MAP.to_string(), "MAP"); + assert_eq!(ConvertedType::MAP_KEY_VALUE.to_string(), "MAP_KEY_VALUE"); + assert_eq!(ConvertedType::LIST.to_string(), "LIST"); + assert_eq!(ConvertedType::ENUM.to_string(), "ENUM"); + assert_eq!(ConvertedType::DECIMAL.to_string(), "DECIMAL"); + assert_eq!(ConvertedType::DATE.to_string(), "DATE"); + assert_eq!(ConvertedType::TIME_MILLIS.to_string(), "TIME_MILLIS"); + assert_eq!(ConvertedType::DATE.to_string(), "DATE"); + assert_eq!(ConvertedType::TIME_MICROS.to_string(), "TIME_MICROS"); + assert_eq!( + ConvertedType::TIMESTAMP_MILLIS.to_string(), "TIMESTAMP_MILLIS" ); assert_eq!( - LogicalType::TIMESTAMP_MICROS.to_string(), + ConvertedType::TIMESTAMP_MICROS.to_string(), "TIMESTAMP_MICROS" ); - assert_eq!(LogicalType::UINT_8.to_string(), "UINT_8"); - assert_eq!(LogicalType::UINT_16.to_string(), "UINT_16"); - assert_eq!(LogicalType::UINT_32.to_string(), "UINT_32"); - assert_eq!(LogicalType::UINT_64.to_string(), "UINT_64"); - assert_eq!(LogicalType::INT_8.to_string(), "INT_8"); - assert_eq!(LogicalType::INT_16.to_string(), "INT_16"); - assert_eq!(LogicalType::INT_32.to_string(), "INT_32"); - assert_eq!(LogicalType::INT_64.to_string(), "INT_64"); - assert_eq!(LogicalType::JSON.to_string(), "JSON"); - assert_eq!(LogicalType::BSON.to_string(), "BSON"); - assert_eq!(LogicalType::INTERVAL.to_string(), "INTERVAL"); + assert_eq!(ConvertedType::UINT_8.to_string(), "UINT_8"); + assert_eq!(ConvertedType::UINT_16.to_string(), "UINT_16"); + assert_eq!(ConvertedType::UINT_32.to_string(), "UINT_32"); + assert_eq!(ConvertedType::UINT_64.to_string(), "UINT_64"); + assert_eq!(ConvertedType::INT_8.to_string(), "INT_8"); + assert_eq!(ConvertedType::INT_16.to_string(), "INT_16"); + assert_eq!(ConvertedType::INT_32.to_string(), "INT_32"); + assert_eq!(ConvertedType::INT_64.to_string(), "INT_64"); + assert_eq!(ConvertedType::JSON.to_string(), "JSON"); + assert_eq!(ConvertedType::BSON.to_string(), "BSON"); + assert_eq!(ConvertedType::INTERVAL.to_string(), "INTERVAL"); } #[test] - fn test_from_logical_type() { - assert_eq!(LogicalType::from(None), LogicalType::NONE); + fn test_from_converted_type() { + let parquet_conv_none: Option = None; + assert_eq!(ConvertedType::from(parquet_conv_none), ConvertedType::NONE); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Utf8)), - LogicalType::UTF8 + ConvertedType::from(Some(parquet::ConvertedType::Utf8)), + ConvertedType::UTF8 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Map)), - LogicalType::MAP + ConvertedType::from(Some(parquet::ConvertedType::Map)), + ConvertedType::MAP ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::MapKeyValue)), - LogicalType::MAP_KEY_VALUE + ConvertedType::from(Some(parquet::ConvertedType::MapKeyValue)), + ConvertedType::MAP_KEY_VALUE ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::List)), - LogicalType::LIST + ConvertedType::from(Some(parquet::ConvertedType::List)), + ConvertedType::LIST ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Enum)), - LogicalType::ENUM + ConvertedType::from(Some(parquet::ConvertedType::Enum)), + ConvertedType::ENUM ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Decimal)), - LogicalType::DECIMAL + ConvertedType::from(Some(parquet::ConvertedType::Decimal)), + ConvertedType::DECIMAL ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Date)), - LogicalType::DATE + ConvertedType::from(Some(parquet::ConvertedType::Date)), + ConvertedType::DATE ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::TimeMillis)), - LogicalType::TIME_MILLIS + ConvertedType::from(Some(parquet::ConvertedType::TimeMillis)), + ConvertedType::TIME_MILLIS ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::TimeMicros)), - LogicalType::TIME_MICROS + ConvertedType::from(Some(parquet::ConvertedType::TimeMicros)), + ConvertedType::TIME_MICROS ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::TimestampMillis)), - LogicalType::TIMESTAMP_MILLIS + ConvertedType::from(Some(parquet::ConvertedType::TimestampMillis)), + ConvertedType::TIMESTAMP_MILLIS ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::TimestampMicros)), - LogicalType::TIMESTAMP_MICROS + ConvertedType::from(Some(parquet::ConvertedType::TimestampMicros)), + ConvertedType::TIMESTAMP_MICROS ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Uint8)), - LogicalType::UINT_8 + ConvertedType::from(Some(parquet::ConvertedType::Uint8)), + ConvertedType::UINT_8 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Uint16)), - LogicalType::UINT_16 + ConvertedType::from(Some(parquet::ConvertedType::Uint16)), + ConvertedType::UINT_16 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Uint32)), - LogicalType::UINT_32 + ConvertedType::from(Some(parquet::ConvertedType::Uint32)), + ConvertedType::UINT_32 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Uint64)), - LogicalType::UINT_64 + ConvertedType::from(Some(parquet::ConvertedType::Uint64)), + ConvertedType::UINT_64 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Int8)), - LogicalType::INT_8 + ConvertedType::from(Some(parquet::ConvertedType::Int8)), + ConvertedType::INT_8 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Int16)), - LogicalType::INT_16 + ConvertedType::from(Some(parquet::ConvertedType::Int16)), + ConvertedType::INT_16 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Int32)), - LogicalType::INT_32 + ConvertedType::from(Some(parquet::ConvertedType::Int32)), + ConvertedType::INT_32 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Int64)), - LogicalType::INT_64 + ConvertedType::from(Some(parquet::ConvertedType::Int64)), + ConvertedType::INT_64 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Json)), - LogicalType::JSON + ConvertedType::from(Some(parquet::ConvertedType::Json)), + ConvertedType::JSON ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Bson)), - LogicalType::BSON + ConvertedType::from(Some(parquet::ConvertedType::Bson)), + ConvertedType::BSON ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Interval)), - LogicalType::INTERVAL + ConvertedType::from(Some(parquet::ConvertedType::Interval)), + ConvertedType::INTERVAL ); } #[test] - fn test_into_logical_type() { + fn test_into_converted_type() { let converted_type: Option = None; - assert_eq!(converted_type, LogicalType::NONE.into()); - assert_eq!(Some(parquet::ConvertedType::Utf8), LogicalType::UTF8.into()); - assert_eq!(Some(parquet::ConvertedType::Map), LogicalType::MAP.into()); + assert_eq!(converted_type, ConvertedType::NONE.into()); + assert_eq!( + Some(parquet::ConvertedType::Utf8), + ConvertedType::UTF8.into() + ); + assert_eq!(Some(parquet::ConvertedType::Map), ConvertedType::MAP.into()); assert_eq!( Some(parquet::ConvertedType::MapKeyValue), - LogicalType::MAP_KEY_VALUE.into() + ConvertedType::MAP_KEY_VALUE.into() + ); + assert_eq!( + Some(parquet::ConvertedType::List), + ConvertedType::LIST.into() + ); + assert_eq!( + Some(parquet::ConvertedType::Enum), + ConvertedType::ENUM.into() ); - assert_eq!(Some(parquet::ConvertedType::List), LogicalType::LIST.into()); - assert_eq!(Some(parquet::ConvertedType::Enum), LogicalType::ENUM.into()); assert_eq!( Some(parquet::ConvertedType::Decimal), - LogicalType::DECIMAL.into() + ConvertedType::DECIMAL.into() + ); + assert_eq!( + Some(parquet::ConvertedType::Date), + ConvertedType::DATE.into() ); - assert_eq!(Some(parquet::ConvertedType::Date), LogicalType::DATE.into()); assert_eq!( Some(parquet::ConvertedType::TimeMillis), - LogicalType::TIME_MILLIS.into() + ConvertedType::TIME_MILLIS.into() ); assert_eq!( Some(parquet::ConvertedType::TimeMicros), - LogicalType::TIME_MICROS.into() + ConvertedType::TIME_MICROS.into() ); assert_eq!( Some(parquet::ConvertedType::TimestampMillis), - LogicalType::TIMESTAMP_MILLIS.into() + ConvertedType::TIMESTAMP_MILLIS.into() ); assert_eq!( Some(parquet::ConvertedType::TimestampMicros), - LogicalType::TIMESTAMP_MICROS.into() + ConvertedType::TIMESTAMP_MICROS.into() ); assert_eq!( Some(parquet::ConvertedType::Uint8), - LogicalType::UINT_8.into() + ConvertedType::UINT_8.into() ); assert_eq!( Some(parquet::ConvertedType::Uint16), - LogicalType::UINT_16.into() + ConvertedType::UINT_16.into() ); assert_eq!( Some(parquet::ConvertedType::Uint32), - LogicalType::UINT_32.into() + ConvertedType::UINT_32.into() ); assert_eq!( Some(parquet::ConvertedType::Uint64), - LogicalType::UINT_64.into() + ConvertedType::UINT_64.into() ); assert_eq!( Some(parquet::ConvertedType::Int8), - LogicalType::INT_8.into() + ConvertedType::INT_8.into() ); assert_eq!( Some(parquet::ConvertedType::Int16), - LogicalType::INT_16.into() + ConvertedType::INT_16.into() ); assert_eq!( Some(parquet::ConvertedType::Int32), - LogicalType::INT_32.into() + ConvertedType::INT_32.into() ); assert_eq!( Some(parquet::ConvertedType::Int64), - LogicalType::INT_64.into() + ConvertedType::INT_64.into() + ); + assert_eq!( + Some(parquet::ConvertedType::Json), + ConvertedType::JSON.into() + ); + assert_eq!( + Some(parquet::ConvertedType::Bson), + ConvertedType::BSON.into() ); - assert_eq!(Some(parquet::ConvertedType::Json), LogicalType::JSON.into()); - assert_eq!(Some(parquet::ConvertedType::Bson), LogicalType::BSON.into()); assert_eq!( Some(parquet::ConvertedType::Interval), - LogicalType::INTERVAL.into() + ConvertedType::INTERVAL.into() ); } #[test] - fn test_from_string_into_logical_type() { + fn test_from_string_into_converted_type() { assert_eq!( - LogicalType::NONE + ConvertedType::NONE .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::NONE + ConvertedType::NONE ); assert_eq!( - LogicalType::UTF8 + ConvertedType::UTF8 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::UTF8 + ConvertedType::UTF8 ); assert_eq!( - LogicalType::MAP.to_string().parse::().unwrap(), - LogicalType::MAP + ConvertedType::MAP + .to_string() + .parse::() + .unwrap(), + ConvertedType::MAP ); assert_eq!( - LogicalType::MAP_KEY_VALUE + ConvertedType::MAP_KEY_VALUE .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::MAP_KEY_VALUE + ConvertedType::MAP_KEY_VALUE ); assert_eq!( - LogicalType::LIST + ConvertedType::LIST .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::LIST + ConvertedType::LIST ); assert_eq!( - LogicalType::ENUM + ConvertedType::ENUM .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::ENUM + ConvertedType::ENUM ); assert_eq!( - LogicalType::DECIMAL + ConvertedType::DECIMAL .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::DECIMAL + ConvertedType::DECIMAL ); assert_eq!( - LogicalType::DATE + ConvertedType::DATE .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::DATE + ConvertedType::DATE ); assert_eq!( - LogicalType::TIME_MILLIS + ConvertedType::TIME_MILLIS .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::TIME_MILLIS + ConvertedType::TIME_MILLIS ); assert_eq!( - LogicalType::TIME_MICROS + ConvertedType::TIME_MICROS .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::TIME_MICROS + ConvertedType::TIME_MICROS ); assert_eq!( - LogicalType::TIMESTAMP_MILLIS + ConvertedType::TIMESTAMP_MILLIS .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::TIMESTAMP_MILLIS + ConvertedType::TIMESTAMP_MILLIS ); assert_eq!( - LogicalType::TIMESTAMP_MICROS + ConvertedType::TIMESTAMP_MICROS .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::TIMESTAMP_MICROS + ConvertedType::TIMESTAMP_MICROS ); assert_eq!( - LogicalType::UINT_8 + ConvertedType::UINT_8 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::UINT_8 + ConvertedType::UINT_8 ); assert_eq!( - LogicalType::UINT_16 + ConvertedType::UINT_16 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::UINT_16 + ConvertedType::UINT_16 ); assert_eq!( - LogicalType::UINT_32 + ConvertedType::UINT_32 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::UINT_32 + ConvertedType::UINT_32 ); assert_eq!( - LogicalType::UINT_64 + ConvertedType::UINT_64 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::UINT_64 + ConvertedType::UINT_64 ); assert_eq!( - LogicalType::INT_8 + ConvertedType::INT_8 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::INT_8 + ConvertedType::INT_8 ); assert_eq!( - LogicalType::INT_16 + ConvertedType::INT_16 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::INT_16 + ConvertedType::INT_16 ); assert_eq!( - LogicalType::INT_32 + ConvertedType::INT_32 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::INT_32 + ConvertedType::INT_32 ); assert_eq!( - LogicalType::INT_64 + ConvertedType::INT_64 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::INT_64 + ConvertedType::INT_64 ); assert_eq!( - LogicalType::JSON + ConvertedType::JSON .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::JSON + ConvertedType::JSON ); assert_eq!( - LogicalType::BSON + ConvertedType::BSON .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::BSON + ConvertedType::BSON ); assert_eq!( - LogicalType::INTERVAL + ConvertedType::INTERVAL .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::INTERVAL + ConvertedType::INTERVAL + ); + } + + #[test] + fn test_logical_to_converted_type() { + let logical_none: Option = None; + assert_eq!(ConvertedType::from(logical_none), ConvertedType::NONE); + assert_eq!( + ConvertedType::from(Some(LogicalType::DECIMAL(DecimalType { + precision: 20, + scale: 5 + }))), + ConvertedType::DECIMAL + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::BSON(Default::default()))), + ConvertedType::BSON + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::JSON(Default::default()))), + ConvertedType::JSON + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::STRING(Default::default()))), + ConvertedType::UTF8 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::DATE(Default::default()))), + ConvertedType::DATE + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::TIME(TimeType { + unit: TimeUnit::MILLIS(Default::default()), + is_adjusted_to_u_t_c: true, + }))), + ConvertedType::TIME_MILLIS + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::TIME(TimeType { + unit: TimeUnit::MICROS(Default::default()), + is_adjusted_to_u_t_c: true, + }))), + ConvertedType::TIME_MICROS + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::TIME(TimeType { + unit: TimeUnit::NANOS(Default::default()), + is_adjusted_to_u_t_c: false, + }))), + ConvertedType::NONE + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::TIMESTAMP(TimestampType { + unit: TimeUnit::MILLIS(Default::default()), + is_adjusted_to_u_t_c: true, + }))), + ConvertedType::TIMESTAMP_MILLIS + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::TIMESTAMP(TimestampType { + unit: TimeUnit::MICROS(Default::default()), + is_adjusted_to_u_t_c: false, + }))), + ConvertedType::TIMESTAMP_MICROS + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::TIMESTAMP(TimestampType { + unit: TimeUnit::NANOS(Default::default()), + is_adjusted_to_u_t_c: false, + }))), + ConvertedType::NONE + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: false + }))), + ConvertedType::UINT_8 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true + }))), + ConvertedType::INT_8 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 16, + is_signed: false + }))), + ConvertedType::UINT_16 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 16, + is_signed: true + }))), + ConvertedType::INT_16 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 32, + is_signed: false + }))), + ConvertedType::UINT_32 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 32, + is_signed: true + }))), + ConvertedType::INT_32 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 64, + is_signed: false + }))), + ConvertedType::UINT_64 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 64, + is_signed: true + }))), + ConvertedType::INT_64 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::LIST(Default::default()))), + ConvertedType::LIST + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::MAP(Default::default()))), + ConvertedType::MAP + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::UUID(Default::default()))), + ConvertedType::NONE + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::ENUM(Default::default()))), + ConvertedType::ENUM + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::UNKNOWN(Default::default()))), + ConvertedType::NONE ); } @@ -1389,13 +1808,114 @@ mod tests { } #[test] - fn test_column_order_get_sort_order() { + fn test_column_order_get_logical_type_sort_order() { // Helper to check the order in a list of values. // Only logical type is checked. fn check_sort_order(types: Vec, expected_order: SortOrder) { for tpe in types { assert_eq!( - ColumnOrder::get_sort_order(tpe, Type::BYTE_ARRAY), + ColumnOrder::get_sort_order( + Some(tpe), + ConvertedType::NONE, + Type::BYTE_ARRAY + ), + expected_order + ); + } + } + + // Unsigned comparison (physical type does not matter) + let unsigned = vec![ + LogicalType::STRING(Default::default()), + LogicalType::JSON(Default::default()), + LogicalType::BSON(Default::default()), + LogicalType::ENUM(Default::default()), + LogicalType::UUID(Default::default()), + LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: false, + }), + LogicalType::INTEGER(IntType { + bit_width: 16, + is_signed: false, + }), + LogicalType::INTEGER(IntType { + bit_width: 32, + is_signed: false, + }), + LogicalType::INTEGER(IntType { + bit_width: 64, + is_signed: false, + }), + ]; + check_sort_order(unsigned, SortOrder::UNSIGNED); + + // Signed comparison (physical type does not matter) + let signed = vec![ + LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true, + }), + LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true, + }), + LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true, + }), + LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true, + }), + LogicalType::DECIMAL(DecimalType { + scale: 20, + precision: 4, + }), + LogicalType::DATE(Default::default()), + LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MILLIS(Default::default()), + }), + LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MICROS(Default::default()), + }), + LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::NANOS(Default::default()), + }), + LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MILLIS(Default::default()), + }), + LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MICROS(Default::default()), + }), + LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::NANOS(Default::default()), + }), + ]; + check_sort_order(signed, SortOrder::SIGNED); + + // Undefined comparison + let undefined = vec![ + LogicalType::LIST(Default::default()), + LogicalType::MAP(Default::default()), + ]; + check_sort_order(undefined, SortOrder::UNDEFINED); + } + + #[test] + fn test_column_order_get_coverted_type_sort_order() { + // Helper to check the order in a list of values. + // Only converted type is checked. + fn check_sort_order(types: Vec, expected_order: SortOrder) { + for tpe in types { + assert_eq!( + ColumnOrder::get_sort_order(None, tpe, Type::BYTE_ARRAY), expected_order ); } @@ -1403,44 +1923,44 @@ mod tests { // Unsigned comparison (physical type does not matter) let unsigned = vec![ - LogicalType::UTF8, - LogicalType::JSON, - LogicalType::BSON, - LogicalType::ENUM, - LogicalType::UINT_8, - LogicalType::UINT_16, - LogicalType::UINT_32, - LogicalType::UINT_64, - LogicalType::INTERVAL, + ConvertedType::UTF8, + ConvertedType::JSON, + ConvertedType::BSON, + ConvertedType::ENUM, + ConvertedType::UINT_8, + ConvertedType::UINT_16, + ConvertedType::UINT_32, + ConvertedType::UINT_64, ]; check_sort_order(unsigned, SortOrder::UNSIGNED); // Signed comparison (physical type does not matter) let signed = vec![ - LogicalType::INT_8, - LogicalType::INT_16, - LogicalType::INT_32, - LogicalType::INT_64, - LogicalType::DECIMAL, - LogicalType::DATE, - LogicalType::TIME_MILLIS, - LogicalType::TIME_MICROS, - LogicalType::TIMESTAMP_MILLIS, - LogicalType::TIMESTAMP_MICROS, + ConvertedType::INT_8, + ConvertedType::INT_16, + ConvertedType::INT_32, + ConvertedType::INT_64, + ConvertedType::DECIMAL, + ConvertedType::DATE, + ConvertedType::TIME_MILLIS, + ConvertedType::TIME_MICROS, + ConvertedType::TIMESTAMP_MILLIS, + ConvertedType::TIMESTAMP_MICROS, ]; check_sort_order(signed, SortOrder::SIGNED); // Undefined comparison let undefined = vec![ - LogicalType::LIST, - LogicalType::MAP, - LogicalType::MAP_KEY_VALUE, + ConvertedType::LIST, + ConvertedType::MAP, + ConvertedType::MAP_KEY_VALUE, + ConvertedType::INTERVAL, ]; check_sort_order(undefined, SortOrder::UNDEFINED); // Check None logical type // This should return a sort order for byte array type. - check_sort_order(vec![LogicalType::NONE], SortOrder::UNSIGNED); + check_sort_order(vec![ConvertedType::NONE], SortOrder::UNSIGNED); } #[test] diff --git a/rust/parquet/src/column/reader.rs b/rust/parquet/src/column/reader.rs index 91f199bae37..d8c2e7a8ebd 100644 --- a/rust/parquet/src/column/reader.rs +++ b/rust/parquet/src/column/reader.rs @@ -1024,7 +1024,7 @@ mod tests { fn get_test_int32_type() -> SchemaType { SchemaType::primitive_type_builder("a", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .with_length(-1) .build() .expect("build() should be OK") @@ -1034,7 +1034,7 @@ mod tests { fn get_test_int64_type() -> SchemaType { SchemaType::primitive_type_builder("a", PhysicalType::INT64) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_64) + .with_converted_type(ConvertedType::INT_64) .with_length(-1) .build() .expect("build() should be OK") diff --git a/rust/parquet/src/file/footer.rs b/rust/parquet/src/file/footer.rs index e83d8b2e036..2e572944868 100644 --- a/rust/parquet/src/file/footer.rs +++ b/rust/parquet/src/file/footer.rs @@ -140,6 +140,7 @@ fn parse_column_orders( TColumnOrder::TYPEORDER(_) => { let sort_order = ColumnOrder::get_sort_order( column.logical_type(), + column.converted_type(), column.physical_type(), ); res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order)); diff --git a/rust/parquet/src/file/writer.rs b/rust/parquet/src/file/writer.rs index 23e39fd855e..265014bf683 100644 --- a/rust/parquet/src/file/writer.rs +++ b/rust/parquet/src/file/writer.rs @@ -533,11 +533,11 @@ mod tests { use std::{fs::File, io::Cursor}; - use crate::basic::{Compression, Encoding, Repetition, Type}; + use crate::basic::{Compression, Encoding, IntType, LogicalType, Repetition, Type}; use crate::column::page::PageReader; use crate::compression::{create_codec, Codec}; use crate::file::{ - properties::WriterProperties, + properties::{WriterProperties, WriterVersion}, reader::{FileReader, SerializedFileReader, SerializedPageReader}, statistics::{from_thrift, to_thrift, Statistics}, }; @@ -722,6 +722,59 @@ mod tests { ); } + #[test] + fn test_file_writer_v2_with_metadata() { + let file = get_temp_file("test_file_writer_v2_write_with_metadata", &[]); + let field_logical_type = Some(LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: false, + })); + let field = Arc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .with_logical_type(field_logical_type.clone()) + .with_converted_type(field_logical_type.into()) + .build() + .unwrap(), + ); + let schema = Arc::new( + types::Type::group_type_builder("schema") + .with_fields(&mut vec![field.clone()]) + .build() + .unwrap(), + ); + let props = Arc::new( + WriterProperties::builder() + .set_key_value_metadata(Some(vec![KeyValue::new( + "key".to_string(), + "value".to_string(), + )])) + .set_writer_version(WriterVersion::PARQUET_2_0) + .build(), + ); + let mut writer = + SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); + writer.close().unwrap(); + + let reader = SerializedFileReader::new(file).unwrap(); + + assert_eq!( + reader + .metadata() + .file_metadata() + .key_value_metadata() + .to_owned() + .unwrap() + .len(), + 1 + ); + + // ARROW-11803: Test that the converted and logical types have been populated + let fields = reader.metadata().file_metadata().schema().get_fields(); + assert_eq!(fields.len(), 1); + let read_field = fields.get(0).unwrap(); + assert_eq!(read_field, &field); + } + #[test] fn test_file_writer_empty_row_groups() { let file = get_temp_file("test_file_writer_write_empty_row_groups", &[]); diff --git a/rust/parquet/src/record/api.rs b/rust/parquet/src/record/api.rs index 9e131b4415c..07f82160db4 100644 --- a/rust/parquet/src/record/api.rs +++ b/rust/parquet/src/record/api.rs @@ -22,7 +22,7 @@ use std::fmt; use chrono::{TimeZone, Utc}; use num_bigint::{BigInt, Sign}; -use crate::basic::{LogicalType, Type as PhysicalType}; +use crate::basic::{ConvertedType, Type as PhysicalType}; use crate::data_type::{ByteArray, Decimal, Int96}; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; @@ -34,9 +34,9 @@ use serde_json::Value; macro_rules! nyi { ($column_descr:ident, $value:ident) => {{ unimplemented!( - "Conversion for physical type {}, logical type {}, value {:?}", + "Conversion for physical type {}, converted type {}, value {:?}", $column_descr.physical_type(), - $column_descr.logical_type(), + $column_descr.converted_type(), $value ); }}; @@ -562,18 +562,18 @@ impl Field { Field::Bool(value) } - /// Converts Parquet INT32 type with logical type into `i32` value. + /// Converts Parquet INT32 type with converted type into `i32` value. #[inline] pub fn convert_int32(descr: &ColumnDescPtr, value: i32) -> Self { - match descr.logical_type() { - LogicalType::INT_8 => Field::Byte(value as i8), - LogicalType::INT_16 => Field::Short(value as i16), - LogicalType::INT_32 | LogicalType::NONE => Field::Int(value), - LogicalType::UINT_8 => Field::UByte(value as u8), - LogicalType::UINT_16 => Field::UShort(value as u16), - LogicalType::UINT_32 => Field::UInt(value as u32), - LogicalType::DATE => Field::Date(value as u32), - LogicalType::DECIMAL => Field::Decimal(Decimal::from_i32( + match descr.converted_type() { + ConvertedType::INT_8 => Field::Byte(value as i8), + ConvertedType::INT_16 => Field::Short(value as i16), + ConvertedType::INT_32 | ConvertedType::NONE => Field::Int(value), + ConvertedType::UINT_8 => Field::UByte(value as u8), + ConvertedType::UINT_16 => Field::UShort(value as u16), + ConvertedType::UINT_32 => Field::UInt(value as u32), + ConvertedType::DATE => Field::Date(value as u32), + ConvertedType::DECIMAL => Field::Decimal(Decimal::from_i32( value, descr.type_precision(), descr.type_scale(), @@ -582,15 +582,15 @@ impl Field { } } - /// Converts Parquet INT64 type with logical type into `i64` value. + /// Converts Parquet INT64 type with converted type into `i64` value. #[inline] pub fn convert_int64(descr: &ColumnDescPtr, value: i64) -> Self { - match descr.logical_type() { - LogicalType::INT_64 | LogicalType::NONE => Field::Long(value), - LogicalType::UINT_64 => Field::ULong(value as u64), - LogicalType::TIMESTAMP_MILLIS => Field::TimestampMillis(value as u64), - LogicalType::TIMESTAMP_MICROS => Field::TimestampMicros(value as u64), - LogicalType::DECIMAL => Field::Decimal(Decimal::from_i64( + match descr.converted_type() { + ConvertedType::INT_64 | ConvertedType::NONE => Field::Long(value), + ConvertedType::UINT_64 => Field::ULong(value as u64), + ConvertedType::TIMESTAMP_MILLIS => Field::TimestampMillis(value as u64), + ConvertedType::TIMESTAMP_MICROS => Field::TimestampMicros(value as u64), + ConvertedType::DECIMAL => Field::Decimal(Decimal::from_i64( value, descr.type_precision(), descr.type_scale(), @@ -612,37 +612,37 @@ impl Field { Field::Float(value) } - /// Converts Parquet DOUBLE type with logical type into `f64` value. + /// Converts Parquet DOUBLE type with converted type into `f64` value. #[inline] pub fn convert_double(_descr: &ColumnDescPtr, value: f64) -> Self { Field::Double(value) } - /// Converts Parquet BYTE_ARRAY type with logical type into either UTF8 string or + /// Converts Parquet BYTE_ARRAY type with converted type into either UTF8 string or /// array of bytes. #[inline] pub fn convert_byte_array(descr: &ColumnDescPtr, value: ByteArray) -> Self { match descr.physical_type() { - PhysicalType::BYTE_ARRAY => match descr.logical_type() { - LogicalType::UTF8 | LogicalType::ENUM | LogicalType::JSON => { + PhysicalType::BYTE_ARRAY => match descr.converted_type() { + ConvertedType::UTF8 | ConvertedType::ENUM | ConvertedType::JSON => { let value = String::from_utf8(value.data().to_vec()).unwrap(); Field::Str(value) } - LogicalType::BSON | LogicalType::NONE => Field::Bytes(value), - LogicalType::DECIMAL => Field::Decimal(Decimal::from_bytes( + ConvertedType::BSON | ConvertedType::NONE => Field::Bytes(value), + ConvertedType::DECIMAL => Field::Decimal(Decimal::from_bytes( value, descr.type_precision(), descr.type_scale(), )), _ => nyi!(descr, value), }, - PhysicalType::FIXED_LEN_BYTE_ARRAY => match descr.logical_type() { - LogicalType::DECIMAL => Field::Decimal(Decimal::from_bytes( + PhysicalType::FIXED_LEN_BYTE_ARRAY => match descr.converted_type() { + ConvertedType::DECIMAL => Field::Decimal(Decimal::from_bytes( value, descr.type_precision(), descr.type_scale(), )), - LogicalType::NONE => Field::Bytes(value), + ConvertedType::NONE => Field::Bytes(value), _ => nyi!(descr, value), }, _ => nyi!(descr, value), @@ -839,7 +839,7 @@ mod tests { macro_rules! make_column_descr { ($physical_type:expr, $logical_type:expr) => {{ let tpe = PrimitiveTypeBuilder::new("col", $physical_type) - .with_logical_type($logical_type) + .with_converted_type($logical_type) .build() .unwrap(); Arc::new(ColumnDescriptor::new( @@ -851,7 +851,7 @@ mod tests { }}; ($physical_type:expr, $logical_type:expr, $len:expr, $prec:expr, $scale:expr) => {{ let tpe = PrimitiveTypeBuilder::new("col", $physical_type) - .with_logical_type($logical_type) + .with_converted_type($logical_type) .with_length($len) .with_precision($prec) .with_scale($scale) @@ -869,7 +869,7 @@ mod tests { #[test] fn test_row_convert_bool() { // BOOLEAN value does not depend on logical type - let descr = make_column_descr![PhysicalType::BOOLEAN, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::BOOLEAN, ConvertedType::NONE]; let row = Field::convert_bool(&descr, true); assert_eq!(row, Field::Bool(true)); @@ -880,70 +880,70 @@ mod tests { #[test] fn test_row_convert_int32() { - let descr = make_column_descr![PhysicalType::INT32, LogicalType::INT_8]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::INT_8]; let row = Field::convert_int32(&descr, 111); assert_eq!(row, Field::Byte(111)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::INT_16]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::INT_16]; let row = Field::convert_int32(&descr, 222); assert_eq!(row, Field::Short(222)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::INT_32]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::INT_32]; let row = Field::convert_int32(&descr, 333); assert_eq!(row, Field::Int(333)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::UINT_8]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::UINT_8]; let row = Field::convert_int32(&descr, -1); assert_eq!(row, Field::UByte(255)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::UINT_16]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::UINT_16]; let row = Field::convert_int32(&descr, 256); assert_eq!(row, Field::UShort(256)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::UINT_32]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::UINT_32]; let row = Field::convert_int32(&descr, 1234); assert_eq!(row, Field::UInt(1234)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::NONE]; let row = Field::convert_int32(&descr, 444); assert_eq!(row, Field::Int(444)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::DATE]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::DATE]; let row = Field::convert_int32(&descr, 14611); assert_eq!(row, Field::Date(14611)); let descr = - make_column_descr![PhysicalType::INT32, LogicalType::DECIMAL, 0, 8, 2]; + make_column_descr![PhysicalType::INT32, ConvertedType::DECIMAL, 0, 8, 2]; let row = Field::convert_int32(&descr, 444); assert_eq!(row, Field::Decimal(Decimal::from_i32(444, 8, 2))); } #[test] fn test_row_convert_int64() { - let descr = make_column_descr![PhysicalType::INT64, LogicalType::INT_64]; + let descr = make_column_descr![PhysicalType::INT64, ConvertedType::INT_64]; let row = Field::convert_int64(&descr, 1111); assert_eq!(row, Field::Long(1111)); - let descr = make_column_descr![PhysicalType::INT64, LogicalType::UINT_64]; + let descr = make_column_descr![PhysicalType::INT64, ConvertedType::UINT_64]; let row = Field::convert_int64(&descr, 78239823); assert_eq!(row, Field::ULong(78239823)); let descr = - make_column_descr![PhysicalType::INT64, LogicalType::TIMESTAMP_MILLIS]; + make_column_descr![PhysicalType::INT64, ConvertedType::TIMESTAMP_MILLIS]; let row = Field::convert_int64(&descr, 1541186529153); assert_eq!(row, Field::TimestampMillis(1541186529153)); let descr = - make_column_descr![PhysicalType::INT64, LogicalType::TIMESTAMP_MICROS]; + make_column_descr![PhysicalType::INT64, ConvertedType::TIMESTAMP_MICROS]; let row = Field::convert_int64(&descr, 1541186529153123); assert_eq!(row, Field::TimestampMicros(1541186529153123)); - let descr = make_column_descr![PhysicalType::INT64, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::INT64, ConvertedType::NONE]; let row = Field::convert_int64(&descr, 2222); assert_eq!(row, Field::Long(2222)); let descr = - make_column_descr![PhysicalType::INT64, LogicalType::DECIMAL, 0, 8, 2]; + make_column_descr![PhysicalType::INT64, ConvertedType::DECIMAL, 0, 8, 2]; let row = Field::convert_int64(&descr, 3333); assert_eq!(row, Field::Decimal(Decimal::from_i64(3333, 8, 2))); } @@ -951,7 +951,7 @@ mod tests { #[test] fn test_row_convert_int96() { // INT96 value does not depend on logical type - let descr = make_column_descr![PhysicalType::INT96, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::INT96, ConvertedType::NONE]; let value = Int96::from(vec![0, 0, 2454923]); let row = Field::convert_int96(&descr, value); @@ -965,7 +965,7 @@ mod tests { #[test] fn test_row_convert_float() { // FLOAT value does not depend on logical type - let descr = make_column_descr![PhysicalType::FLOAT, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::FLOAT, ConvertedType::NONE]; let row = Field::convert_float(&descr, 2.31); assert_eq!(row, Field::Float(2.31)); } @@ -973,7 +973,7 @@ mod tests { #[test] fn test_row_convert_double() { // DOUBLE value does not depend on logical type - let descr = make_column_descr![PhysicalType::DOUBLE, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::DOUBLE, ConvertedType::NONE]; let row = Field::convert_double(&descr, 1.56); assert_eq!(row, Field::Double(1.56)); } @@ -981,38 +981,38 @@ mod tests { #[test] fn test_row_convert_byte_array() { // UTF8 - let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::UTF8]; + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::UTF8]; let value = ByteArray::from(vec![b'A', b'B', b'C', b'D']); let row = Field::convert_byte_array(&descr, value); assert_eq!(row, Field::Str("ABCD".to_string())); // ENUM - let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::ENUM]; + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::ENUM]; let value = ByteArray::from(vec![b'1', b'2', b'3']); let row = Field::convert_byte_array(&descr, value); assert_eq!(row, Field::Str("123".to_string())); // JSON - let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::JSON]; + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::JSON]; let value = ByteArray::from(vec![b'{', b'"', b'a', b'"', b':', b'1', b'}']); let row = Field::convert_byte_array(&descr, value); assert_eq!(row, Field::Str("{\"a\":1}".to_string())); // NONE - let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::NONE]; let value = ByteArray::from(vec![1, 2, 3, 4, 5]); let row = Field::convert_byte_array(&descr, value.clone()); assert_eq!(row, Field::Bytes(value)); // BSON - let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::BSON]; + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::BSON]; let value = ByteArray::from(vec![1, 2, 3, 4, 5]); let row = Field::convert_byte_array(&descr, value.clone()); assert_eq!(row, Field::Bytes(value)); // DECIMAL let descr = - make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::DECIMAL, 0, 8, 2]; + make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::DECIMAL, 0, 8, 2]; let value = ByteArray::from(vec![207, 200]); let row = Field::convert_byte_array(&descr, value.clone()); assert_eq!(row, Field::Decimal(Decimal::from_bytes(value, 8, 2))); @@ -1020,7 +1020,7 @@ mod tests { // DECIMAL (FIXED_LEN_BYTE_ARRAY) let descr = make_column_descr![ PhysicalType::FIXED_LEN_BYTE_ARRAY, - LogicalType::DECIMAL, + ConvertedType::DECIMAL, 8, 17, 5 @@ -1032,7 +1032,7 @@ mod tests { // NONE (FIXED_LEN_BYTE_ARRAY) let descr = make_column_descr![ PhysicalType::FIXED_LEN_BYTE_ARRAY, - LogicalType::NONE, + ConvertedType::NONE, 6, 0, 0 diff --git a/rust/parquet/src/record/reader.rs b/rust/parquet/src/record/reader.rs index 5f42d37bac0..2323cd17b71 100644 --- a/rust/parquet/src/record/reader.rs +++ b/rust/parquet/src/record/reader.rs @@ -20,7 +20,7 @@ use std::{collections::HashMap, fmt, sync::Arc}; -use crate::basic::{LogicalType, Repetition}; +use crate::basic::{ConvertedType, Repetition}; use crate::errors::{ParquetError, Result}; use crate::file::reader::{FileReader, RowGroupReader}; use crate::record::{ @@ -138,9 +138,9 @@ impl TreeBuilder { let column = TripletIter::new(col_descr, col_reader, self.batch_size); Reader::PrimitiveReader(field, column) } else { - match field.get_basic_info().logical_type() { + match field.get_basic_info().converted_type() { // List types - LogicalType::LIST => { + ConvertedType::LIST => { assert_eq!( field.get_fields().len(), 1, @@ -198,7 +198,7 @@ impl TreeBuilder { } } // Map types (key-value pairs) - LogicalType::MAP | LogicalType::MAP_KEY_VALUE => { + ConvertedType::MAP | ConvertedType::MAP_KEY_VALUE => { assert_eq!( field.get_fields().len(), 1, @@ -269,7 +269,7 @@ impl TreeBuilder { _ if repetition == Repetition::REPEATED => { let required_field = Type::group_type_builder(field.name()) .with_repetition(Repetition::REQUIRED) - .with_logical_type(field.get_basic_info().logical_type()) + .with_converted_type(field.get_basic_info().converted_type()) .with_fields(&mut Vec::from(field.get_fields())) .build() .unwrap(); diff --git a/rust/parquet/src/schema/mod.rs b/rust/parquet/src/schema/mod.rs index 47ba3ca8b78..1ebee2e06e8 100644 --- a/rust/parquet/src/schema/mod.rs +++ b/rust/parquet/src/schema/mod.rs @@ -21,7 +21,7 @@ //! //! ```rust //! use parquet::{ -//! basic::{LogicalType, Repetition, Type as PhysicalType}, +//! basic::{ConvertedType, Repetition, Type as PhysicalType}, //! schema::{parser, printer, types::Type}, //! }; //! use std::sync::Arc; @@ -34,7 +34,7 @@ //! // } //! //! let field_a = Type::primitive_type_builder("a", PhysicalType::BYTE_ARRAY) -//! .with_logical_type(LogicalType::UTF8) +//! .with_converted_type(ConvertedType::UTF8) //! .with_repetition(Repetition::OPTIONAL) //! .build() //! .unwrap(); diff --git a/rust/parquet/src/schema/parser.rs b/rust/parquet/src/schema/parser.rs index 9f14a550241..50f00bb5534 100644 --- a/rust/parquet/src/schema/parser.rs +++ b/rust/parquet/src/schema/parser.rs @@ -44,7 +44,7 @@ use std::sync::Arc; -use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; +use crate::basic::{ConvertedType, Repetition, Type as PhysicalType}; use crate::errors::{ParquetError, Result}; use crate::schema::types::{Type, TypePtr}; @@ -223,17 +223,17 @@ impl<'a> Parser<'a> { .ok_or_else(|| general_err!("Expected name, found None"))?; // Parse logical type if exists - let logical_type = if let Some("(") = self.tokenizer.next() { + let converted_type = if let Some("(") = self.tokenizer.next() { let tpe = self .tokenizer .next() .ok_or_else(|| general_err!("Expected logical type, found None")) - .and_then(|v| v.to_uppercase().parse::())?; + .and_then(|v| v.to_uppercase().parse::())?; assert_token(self.tokenizer.next(), ")")?; tpe } else { self.tokenizer.backtrack(); - LogicalType::NONE + ConvertedType::NONE }; // Parse optional id @@ -246,7 +246,7 @@ impl<'a> Parser<'a> { let mut fields = self.parse_child_types()?; let mut builder = Type::group_type_builder(name) - .with_logical_type(logical_type) + .with_converted_type(converted_type) .with_fields(&mut fields); if let Some(rep) = repetition { builder = builder.with_repetition(rep); @@ -281,18 +281,19 @@ impl<'a> Parser<'a> { .ok_or_else(|| general_err!("Expected name, found None"))?; // Parse logical type - let (logical_type, precision, scale) = if let Some("(") = self.tokenizer.next() { + let (converted_type, precision, scale) = if let Some("(") = self.tokenizer.next() + { let tpe = self .tokenizer .next() .ok_or_else(|| general_err!("Expected logical type, found None")) - .and_then(|v| v.to_uppercase().parse::())?; + .and_then(|v| v.to_uppercase().parse::())?; // Parse precision and scale for decimals let mut precision: i32 = -1; let mut scale: i32 = -1; - if tpe == LogicalType::DECIMAL { + if tpe == ConvertedType::DECIMAL { if let Some("(") = self.tokenizer.next() { // Parse precision precision = parse_i32( @@ -324,7 +325,7 @@ impl<'a> Parser<'a> { (tpe, precision, scale) } else { self.tokenizer.backtrack(); - (LogicalType::NONE, -1, -1) + (ConvertedType::NONE, -1, -1) }; // Parse optional id @@ -338,7 +339,7 @@ impl<'a> Parser<'a> { let mut builder = Type::primitive_type_builder(name, physical_type) .with_repetition(repetition) - .with_logical_type(logical_type) + .with_converted_type(converted_type) .with_length(length) .with_precision(precision) .with_scale(scale); @@ -597,7 +598,7 @@ mod tests { "f1", PhysicalType::FIXED_LEN_BYTE_ARRAY, ) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_length(5) .with_precision(9) .with_scale(3) @@ -609,7 +610,7 @@ mod tests { "f2", PhysicalType::FIXED_LEN_BYTE_ARRAY, ) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_length(16) .with_precision(38) .with_scale(18) @@ -656,14 +657,14 @@ mod tests { Arc::new( Type::group_type_builder("a1") .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::LIST) + .with_converted_type(ConvertedType::LIST) .with_fields(&mut vec![Arc::new( Type::primitive_type_builder( "a2", PhysicalType::BYTE_ARRAY, ) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .build() .unwrap(), )]) @@ -673,7 +674,7 @@ mod tests { Arc::new( Type::group_type_builder("b1") .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::LIST) + .with_converted_type(ConvertedType::LIST) .with_fields(&mut vec![Arc::new( Type::group_type_builder("b2") .with_repetition(Repetition::REPEATED) @@ -734,14 +735,14 @@ mod tests { Arc::new( Type::primitive_type_builder("_1", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_8) + .with_converted_type(ConvertedType::INT_8) .build() .unwrap(), ), Arc::new( Type::primitive_type_builder("_2", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_16) + .with_converted_type(ConvertedType::INT_16) .build() .unwrap(), ), @@ -759,13 +760,13 @@ mod tests { ), Arc::new( Type::primitive_type_builder("_5", PhysicalType::INT32) - .with_logical_type(LogicalType::DATE) + .with_converted_type(ConvertedType::DATE) .build() .unwrap(), ), Arc::new( Type::primitive_type_builder("_6", PhysicalType::BYTE_ARRAY) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .build() .unwrap(), ), diff --git a/rust/parquet/src/schema/printer.rs b/rust/parquet/src/schema/printer.rs index 235dd564016..81ada8f6f99 100644 --- a/rust/parquet/src/schema/printer.rs +++ b/rust/parquet/src/schema/printer.rs @@ -45,7 +45,7 @@ use std::{fmt, io}; -use crate::basic::{LogicalType, Type as PhysicalType}; +use crate::basic::{ConvertedType, Type as PhysicalType}; use crate::file::metadata::{ ColumnChunkMetaData, FileMetaData, ParquetMetaData, RowGroupMetaData, }; @@ -215,9 +215,9 @@ impl<'a> Printer<'a> { _ => format!("{}", physical_type), }; // Also print logical type if it is available - let logical_type_str = match basic_info.logical_type() { - LogicalType::NONE => format!(""), - decimal @ LogicalType::DECIMAL => { + let converted_type_str = match basic_info.converted_type() { + ConvertedType::NONE => format!(""), + decimal @ ConvertedType::DECIMAL => { // For decimal type we should print precision and scale if they // are > 0, e.g. DECIMAL(9, 2) - // DECIMAL(9) - DECIMAL @@ -228,7 +228,7 @@ impl<'a> Printer<'a> { }; format!(" ({}{})", decimal, precision_scale) } - other_logical_type => format!(" ({})", other_logical_type), + other_converted_type => format!(" ({})", other_converted_type), }; write!( self.output, @@ -236,7 +236,7 @@ impl<'a> Printer<'a> { basic_info.repetition(), phys_type_str, basic_info.name(), - logical_type_str + converted_type_str ); } Type::GroupType { @@ -246,8 +246,8 @@ impl<'a> Printer<'a> { if basic_info.has_repetition() { let r = basic_info.repetition(); write!(self.output, "{} group {} ", r, basic_info.name()); - if basic_info.logical_type() != LogicalType::NONE { - write!(self.output, "({}) ", basic_info.logical_type()); + if basic_info.converted_type() != ConvertedType::NONE { + write!(self.output, "({}) ", basic_info.converted_type()); } writeln!(self.output, "{{"); } else { @@ -293,7 +293,7 @@ mod tests { let mut p = Printer::new(&mut s); let field = Type::primitive_type_builder("field", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .build() .unwrap(); p.print(&field); @@ -322,17 +322,17 @@ mod tests { let mut p = Printer::new(&mut s); let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .with_id(0) .build(); let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .with_id(1) .build(); let f3 = Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::INTERVAL) + .with_converted_type(ConvertedType::INTERVAL) .with_length(12) .with_id(2) .build(); @@ -369,13 +369,13 @@ mod tests { fn test_print_and_parse_primitive() { let a2 = Type::primitive_type_builder("a2", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .build() .unwrap(); let a1 = Type::group_type_builder("a1") .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::LIST) + .with_converted_type(ConvertedType::LIST) .with_fields(&mut vec![Arc::new(a2)]) .build() .unwrap(); @@ -392,14 +392,14 @@ mod tests { let b2 = Type::group_type_builder("b2") .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::NONE) + .with_converted_type(ConvertedType::NONE) .with_fields(&mut vec![Arc::new(b3), Arc::new(b4)]) .build() .unwrap(); let b1 = Type::group_type_builder("b1") .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::LIST) + .with_converted_type(ConvertedType::LIST) .with_fields(&mut vec![Arc::new(b2)]) .build() .unwrap(); @@ -422,13 +422,13 @@ mod tests { fn test_print_and_parse_nested() { let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .build() .unwrap(); let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .build() .unwrap(); @@ -440,7 +440,7 @@ mod tests { let f3 = Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::INTERVAL) + .with_converted_type(ConvertedType::INTERVAL) .with_length(12) .build() .unwrap(); @@ -457,7 +457,7 @@ mod tests { fn test_print_and_parse_decimal() { let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(9) .with_scale(2) .build() @@ -465,7 +465,7 @@ mod tests { let f2 = Type::primitive_type_builder("f2", PhysicalType::INT32) .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(9) .with_scale(0) .build() diff --git a/rust/parquet/src/schema/types.rs b/rust/parquet/src/schema/types.rs index 5c35e1cde2c..d80fe0d011f 100644 --- a/rust/parquet/src/schema/types.rs +++ b/rust/parquet/src/schema/types.rs @@ -21,7 +21,7 @@ use std::{collections::HashMap, convert::From, fmt, sync::Arc}; use parquet_format::SchemaElement; -use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; +use crate::basic::{ConvertedType, LogicalType, Repetition, Type as PhysicalType}; use crate::errors::{ParquetError, Result}; // ---------------------------------------------------------------------- @@ -192,7 +192,8 @@ pub struct PrimitiveTypeBuilder<'a> { name: &'a str, repetition: Repetition, physical_type: PhysicalType, - logical_type: LogicalType, + converted_type: ConvertedType, + logical_type: Option, length: i32, precision: i32, scale: i32, @@ -206,7 +207,8 @@ impl<'a> PrimitiveTypeBuilder<'a> { name, repetition: Repetition::OPTIONAL, physical_type, - logical_type: LogicalType::NONE, + converted_type: ConvertedType::NONE, + logical_type: None, length: -1, precision: -1, scale: -1, @@ -220,8 +222,14 @@ impl<'a> PrimitiveTypeBuilder<'a> { self } + /// Sets [`ConvertedType`](crate::basic::ConvertedType) for this field and returns itself. + pub fn with_converted_type(mut self, converted_type: ConvertedType) -> Self { + self.converted_type = converted_type; + self + } + /// Sets [`LogicalType`](crate::basic::LogicalType) for this field and returns itself. - pub fn with_logical_type(mut self, logical_type: LogicalType) -> Self { + pub fn with_logical_type(mut self, logical_type: Option) -> Self { self.logical_type = logical_type; self } @@ -261,6 +269,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { let basic_info = BasicTypeInfo { name: String::from(self.name), repetition: Some(self.repetition), + converted_type: self.converted_type, logical_type: self.logical_type, id: self.id, }; @@ -273,17 +282,17 @@ impl<'a> PrimitiveTypeBuilder<'a> { )); } - match self.logical_type { - LogicalType::NONE => {} - LogicalType::UTF8 | LogicalType::BSON | LogicalType::JSON => { + match self.converted_type { + ConvertedType::NONE => {} + ConvertedType::UTF8 | ConvertedType::BSON | ConvertedType::JSON => { if self.physical_type != PhysicalType::BYTE_ARRAY { return Err(general_err!( "{} can only annotate BYTE_ARRAY fields", - self.logical_type + self.converted_type )); } } - LogicalType::DECIMAL => { + ConvertedType::DECIMAL => { match self.physical_type { PhysicalType::INT32 | PhysicalType::INT64 @@ -353,34 +362,34 @@ impl<'a> PrimitiveTypeBuilder<'a> { _ => (), // For BYTE_ARRAY precision is not limited } } - LogicalType::DATE - | LogicalType::TIME_MILLIS - | LogicalType::UINT_8 - | LogicalType::UINT_16 - | LogicalType::UINT_32 - | LogicalType::INT_8 - | LogicalType::INT_16 - | LogicalType::INT_32 => { + ConvertedType::DATE + | ConvertedType::TIME_MILLIS + | ConvertedType::UINT_8 + | ConvertedType::UINT_16 + | ConvertedType::UINT_32 + | ConvertedType::INT_8 + | ConvertedType::INT_16 + | ConvertedType::INT_32 => { if self.physical_type != PhysicalType::INT32 { return Err(general_err!( "{} can only annotate INT32", - self.logical_type + self.converted_type )); } } - LogicalType::TIME_MICROS - | LogicalType::TIMESTAMP_MILLIS - | LogicalType::TIMESTAMP_MICROS - | LogicalType::UINT_64 - | LogicalType::INT_64 => { + ConvertedType::TIME_MICROS + | ConvertedType::TIMESTAMP_MILLIS + | ConvertedType::TIMESTAMP_MICROS + | ConvertedType::UINT_64 + | ConvertedType::INT_64 => { if self.physical_type != PhysicalType::INT64 { return Err(general_err!( "{} can only annotate INT64", - self.logical_type + self.converted_type )); } } - LogicalType::INTERVAL => { + ConvertedType::INTERVAL => { if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 { @@ -389,7 +398,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { )); } } - LogicalType::ENUM => { + ConvertedType::ENUM => { if self.physical_type != PhysicalType::BYTE_ARRAY { return Err(general_err!("ENUM can only annotate BYTE_ARRAY fields")); } @@ -397,7 +406,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { _ => { return Err(general_err!( "{} cannot be applied to a primitive type", - self.logical_type + self.converted_type )); } } @@ -418,7 +427,8 @@ impl<'a> PrimitiveTypeBuilder<'a> { pub struct GroupTypeBuilder<'a> { name: &'a str, repetition: Option, - logical_type: LogicalType, + converted_type: ConvertedType, + logical_type: Option, fields: Vec, id: Option, } @@ -429,7 +439,8 @@ impl<'a> GroupTypeBuilder<'a> { Self { name, repetition: None, - logical_type: LogicalType::NONE, + converted_type: ConvertedType::NONE, + logical_type: None, fields: Vec::new(), id: None, } @@ -441,8 +452,14 @@ impl<'a> GroupTypeBuilder<'a> { self } + /// Sets [`ConvertedType`](crate::basic::ConvertedType) for this field and returns itself. + pub fn with_converted_type(mut self, converted_type: ConvertedType) -> Self { + self.converted_type = converted_type; + self + } + /// Sets [`LogicalType`](crate::basic::LogicalType) for this field and returns itself. - pub fn with_logical_type(mut self, logical_type: LogicalType) -> Self { + pub fn with_logical_type(mut self, logical_type: Option) -> Self { self.logical_type = logical_type; self } @@ -465,6 +482,7 @@ impl<'a> GroupTypeBuilder<'a> { let basic_info = BasicTypeInfo { name: String::from(self.name), repetition: self.repetition, + converted_type: self.converted_type, logical_type: self.logical_type, id: self.id, }; @@ -481,7 +499,8 @@ impl<'a> GroupTypeBuilder<'a> { pub struct BasicTypeInfo { name: String, repetition: Option, - logical_type: LogicalType, + converted_type: ConvertedType, + logical_type: Option, id: Option, } @@ -504,9 +523,15 @@ impl BasicTypeInfo { self.repetition.unwrap() } + /// Returns [`ConvertedType`](crate::basic::ConvertedType) value for the type. + pub fn converted_type(&self) -> ConvertedType { + self.converted_type + } + /// Returns [`LogicalType`](crate::basic::LogicalType) value for the type. - pub fn logical_type(&self) -> LogicalType { - self.logical_type + pub fn logical_type(&self) -> Option { + // Unlike ConvertedType, LogicalType cannot implement Copy, thus we clone it + self.logical_type.clone() } /// Returns `true` if id is set, `false` otherwise. @@ -665,8 +690,13 @@ impl ColumnDescriptor { self.primitive_type.name() } + /// Returns [`ConvertedType`](crate::basic::ConvertedType) for this column. + pub fn converted_type(&self) -> ConvertedType { + self.primitive_type.get_basic_info().converted_type() + } + /// Returns [`LogicalType`](crate::basic::LogicalType) for this column. - pub fn logical_type(&self) -> LogicalType { + pub fn logical_type(&self) -> Option { self.primitive_type.get_basic_info().logical_type() } @@ -906,7 +936,14 @@ fn from_thrift_helper( elements.len() )); } - let logical_type = LogicalType::from(elements[index].converted_type); + let element = &elements[index]; + let converted_type = ConvertedType::from(element.converted_type); + // LogicalType is only present in v2 Parquet files. ConvertedType is always + // populated, regardless of the version of the file (v1 or v2). + let logical_type = element + .logical_type + .as_ref() + .map(|value| LogicalType::from(value.clone())); let field_id = elements[index].field_id; match elements[index].num_children { // From parquet-format: @@ -929,6 +966,7 @@ fn from_thrift_helper( let name = &elements[index].name; let mut builder = Type::primitive_type_builder(name, physical_type) .with_repetition(repetition) + .with_converted_type(converted_type) .with_logical_type(logical_type) .with_length(length) .with_precision(precision) @@ -949,6 +987,7 @@ fn from_thrift_helper( } let mut builder = Type::group_type_builder(&elements[index].name) + .with_converted_type(converted_type) .with_logical_type(logical_type) .with_fields(&mut fields); if let Some(rep) = repetition { @@ -1002,7 +1041,7 @@ fn to_thrift_helper(schema: &Type, elements: &mut Vec) { repetition_type: Some(basic_info.repetition().into()), name: basic_info.name().to_owned(), num_children: None, - converted_type: basic_info.logical_type().into(), + converted_type: basic_info.converted_type().into(), scale: if scale >= 0 { Some(scale) } else { None }, precision: if precision >= 0 { Some(precision) @@ -1014,7 +1053,7 @@ fn to_thrift_helper(schema: &Type, elements: &mut Vec) { } else { None }, - logical_type: None, + logical_type: basic_info.logical_type().map(|value| value.into()), }; elements.push(element); @@ -1035,7 +1074,7 @@ fn to_thrift_helper(schema: &Type, elements: &mut Vec) { repetition_type: repetition, name: basic_info.name().to_owned(), num_children: Some(fields.len() as i32), - converted_type: basic_info.logical_type().into(), + converted_type: basic_info.converted_type().into(), scale: None, precision: None, field_id: if basic_info.has_id() { @@ -1043,7 +1082,7 @@ fn to_thrift_helper(schema: &Type, elements: &mut Vec) { } else { None }, - logical_type: None, + logical_type: basic_info.logical_type().map(|value| value.into()), }; elements.push(element); @@ -1062,10 +1101,12 @@ mod tests { use crate::schema::parser::parse_message_type; + // TODO: add tests for v2 types + #[test] fn test_primitive_type() { let mut result = Type::primitive_type_builder("foo", PhysicalType::INT32) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .with_id(0) .build(); assert!(result.is_ok()); @@ -1075,7 +1116,7 @@ mod tests { assert!(!tp.is_group()); let basic_info = tp.get_basic_info(); assert_eq!(basic_info.repetition(), Repetition::OPTIONAL); - assert_eq!(basic_info.logical_type(), LogicalType::INT_32); + assert_eq!(basic_info.converted_type(), ConvertedType::INT_32); assert_eq!(basic_info.id(), 0); match tp { Type::PrimitiveType { physical_type, .. } => { @@ -1088,7 +1129,7 @@ mod tests { // Test illegal inputs result = Type::primitive_type_builder("foo", PhysicalType::INT64) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::BSON) + .with_converted_type(ConvertedType::BSON) .build(); assert!(result.is_err()); if let Err(e) = result { @@ -1100,7 +1141,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT96) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(-1) .with_scale(-1) .build(); @@ -1114,7 +1155,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(-1) .with_scale(-1) .build(); @@ -1128,7 +1169,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(0) .with_scale(-1) .build(); @@ -1142,7 +1183,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(1) .with_scale(-1) .build(); @@ -1153,7 +1194,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(1) .with_scale(2) .build(); @@ -1167,7 +1208,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(18) .with_scale(2) .build(); @@ -1181,7 +1222,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT64) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(32) .with_scale(2) .build(); @@ -1195,7 +1236,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_length(5) .with_precision(12) .with_scale(2) @@ -1210,7 +1251,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT64) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::UINT_8) + .with_converted_type(ConvertedType::UINT_8) .build(); assert!(result.is_err()); if let Err(e) = result { @@ -1222,7 +1263,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::TIME_MICROS) + .with_converted_type(ConvertedType::TIME_MICROS) .build(); assert!(result.is_err()); if let Err(e) = result { @@ -1234,7 +1275,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INTERVAL) + .with_converted_type(ConvertedType::INTERVAL) .build(); assert!(result.is_err()); if let Err(e) = result { @@ -1246,7 +1287,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INTERVAL) + .with_converted_type(ConvertedType::INTERVAL) .with_length(1) .build(); assert!(result.is_err()); @@ -1259,7 +1300,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::ENUM) + .with_converted_type(ConvertedType::ENUM) .build(); assert!(result.is_err()); if let Err(e) = result { @@ -1271,7 +1312,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::MAP) + .with_converted_type(ConvertedType::MAP) .build(); assert!(result.is_err()); if let Err(e) = result { @@ -1283,7 +1324,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_length(-1) .build(); assert!(result.is_err()); @@ -1298,12 +1339,12 @@ mod tests { #[test] fn test_group_type() { let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .with_id(0) .build(); assert!(f1.is_ok()); let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .with_id(1) .build(); assert!(f2.is_ok()); @@ -1324,7 +1365,7 @@ mod tests { assert!(tp.is_group()); assert!(!tp.is_primitive()); assert_eq!(basic_info.repetition(), Repetition::REPEATED); - assert_eq!(basic_info.logical_type(), LogicalType::NONE); + assert_eq!(basic_info.converted_type(), ConvertedType::NONE); assert_eq!(basic_info.id(), 1); assert_eq!(tp.get_fields().len(), 2); assert_eq!(tp.get_fields()[0].name(), "f1"); @@ -1343,13 +1384,13 @@ mod tests { fn test_column_descriptor_helper() -> Result<()> { let tp = Type::primitive_type_builder("name", PhysicalType::BYTE_ARRAY) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .build()?; let descr = ColumnDescriptor::new(Arc::new(tp), 4, 1, ColumnPath::from("name")); assert_eq!(descr.path(), &ColumnPath::from("name")); - assert_eq!(descr.logical_type(), LogicalType::UTF8); + assert_eq!(descr.converted_type(), ConvertedType::UTF8); assert_eq!(descr.physical_type(), PhysicalType::BYTE_ARRAY); assert_eq!(descr.max_def_level(), 4); assert_eq!(descr.max_rep_level(), 1); @@ -1377,33 +1418,33 @@ mod tests { let inta = Type::primitive_type_builder("a", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .build()?; fields.push(Arc::new(inta)); let intb = Type::primitive_type_builder("b", PhysicalType::INT64) - .with_logical_type(LogicalType::INT_64) + .with_converted_type(ConvertedType::INT_64) .build()?; fields.push(Arc::new(intb)); let intc = Type::primitive_type_builder("c", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .build()?; fields.push(Arc::new(intc)); // 3-level list encoding let item1 = Type::primitive_type_builder("item1", PhysicalType::INT64) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_64) + .with_converted_type(ConvertedType::INT_64) .build()?; let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?; let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .build()?; let list = Type::group_type_builder("records") .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::LIST) + .with_converted_type(ConvertedType::LIST) .with_fields(&mut vec![Arc::new(item1), Arc::new(item2), Arc::new(item3)]) .build()?; let bag = Type::group_type_builder("bag") @@ -1522,11 +1563,11 @@ mod tests { // OK: different logical type does not affect check_contains let f1 = Type::primitive_type_builder("f", PhysicalType::INT32) - .with_logical_type(LogicalType::UINT_8) + .with_converted_type(ConvertedType::UINT_8) .build() .unwrap(); let f2 = Type::primitive_type_builder("f", PhysicalType::INT32) - .with_logical_type(LogicalType::UINT_16) + .with_converted_type(ConvertedType::UINT_16) .build() .unwrap(); assert!(f1.check_contains(&f2)); diff --git a/rust/parquet/src/schema/visitor.rs b/rust/parquet/src/schema/visitor.rs index 04d77599c91..61bc3be951d 100644 --- a/rust/parquet/src/schema/visitor.rs +++ b/rust/parquet/src/schema/visitor.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::basic::{LogicalType, Repetition}; +use crate::basic::{ConvertedType, Repetition}; use crate::errors::ParquetError::General; use crate::errors::Result; use crate::schema::types::{Type, TypePtr}; @@ -100,9 +100,9 @@ pub trait TypeVisitor { if cur_type.is_primitive() { self.visit_primitive(cur_type, context) } else { - match cur_type.get_basic_info().logical_type() { - LogicalType::LIST => self.visit_list(cur_type, context), - LogicalType::MAP | LogicalType::MAP_KEY_VALUE => { + match cur_type.get_basic_info().converted_type() { + ConvertedType::LIST => self.visit_list(cur_type, context), + ConvertedType::MAP | ConvertedType::MAP_KEY_VALUE => { self.visit_map(cur_type, context) } _ => self.visit_struct(cur_type, context),