From 6333d7913d2c2692522353535804174f2526952d Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Sat, 19 Oct 2024 10:15:04 -0700 Subject: [PATCH 01/68] preliminary changes --- arrow-arith/src/numeric.rs | 40 ++++ arrow-array/src/array/mod.rs | 2 + arrow-array/src/array/primitive_array.rs | 192 ++++++++++++++++++- arrow-array/src/builder/buffer_builder.rs | 4 + arrow-array/src/builder/primitive_builder.rs | 7 +- arrow-array/src/builder/struct_builder.rs | 6 + arrow-array/src/types.rs | 92 ++++++++- arrow-cast/src/cast/mod.rs | 25 +-- arrow-cast/src/display.rs | 2 +- arrow-data/src/data.rs | 4 +- arrow-data/src/decimal.rs | 159 ++++++++++++++- arrow-data/src/equal/mod.rs | 2 + arrow-data/src/transform/mod.rs | 8 +- arrow-integration-test/src/datatype.rs | 20 +- arrow-integration-test/src/lib.rs | 36 ++++ arrow-ipc/src/convert.rs | 42 ++-- arrow-schema/src/datatype.rs | 52 +++++ arrow-schema/src/datatype_parse.rs | 35 ++++ arrow-schema/src/ffi.rs | 16 +- arrow-schema/src/field.rs | 2 + arrow/benches/array_from_vec.rs | 46 +++++ arrow/benches/builder.rs | 40 +++- arrow/benches/cast_kernels.rs | 34 ++++ arrow/benches/decimal_validate.rs | 57 +++++- arrow/src/tensor.rs | 4 + parquet/src/arrow/arrow_reader/statistics.rs | 98 +++++++++- parquet/src/arrow/arrow_writer/levels.rs | 2 + parquet/src/arrow/arrow_writer/mod.rs | 55 ++++++ parquet/src/arrow/schema/mod.rs | 1 + 29 files changed, 1020 insertions(+), 63 deletions(-) diff --git a/arrow-arith/src/numeric.rs b/arrow-arith/src/numeric.rs index b6af40f7d7c2..7d4a68ca3f2f 100644 --- a/arrow-arith/src/numeric.rs +++ b/arrow-arith/src/numeric.rs @@ -109,6 +109,20 @@ pub fn neg(array: &dyn Array) -> Result { Float16 => neg_wrapping!(Float16Type, array), Float32 => neg_wrapping!(Float32Type, array), Float64 => neg_wrapping!(Float64Type, array), + Decimal32(p, s) => { + let a = array + .as_primitive::() + .try_unary::<_, Decimal32Type, _>(|x| x.neg_checked())?; + + Ok(Arc::new(a.with_precision_and_scale(*p, *s)?)) + } + Decimal64(p, s) => { + let a = array + .as_primitive::() + .try_unary::<_, Decimal64Type, _>(|x| x.neg_checked())?; + + Ok(Arc::new(a.with_precision_and_scale(*p, *s)?)) + } Decimal128(p, s) => { let a = array .as_primitive::() @@ -232,6 +246,8 @@ fn arithmetic_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result interval_op::(op, l, l_scalar, r, r_scalar), (Date32, _) => date_op::(op, l, l_scalar, r, r_scalar), (Date64, _) => date_op::(op, l, l_scalar, r, r_scalar), + (Decimal32(_, _), Decimal32(_, _)) => decimal_op::(op, l, l_scalar, r, r_scalar), + (Decimal64(_, _), Decimal64(_, _)) => decimal_op::(op, l, l_scalar, r, r_scalar), (Decimal128(_, _), Decimal128(_, _)) => decimal_op::(op, l, l_scalar, r, r_scalar), (Decimal256(_, _), Decimal256(_, _)) => decimal_op::(op, l, l_scalar, r, r_scalar), (l_t, r_t) => match (l_t, r_t) { @@ -726,6 +742,8 @@ fn decimal_op( let r = r.as_primitive::(); let (p1, s1, p2, s2) = match (l.data_type(), r.data_type()) { + (DataType::Decimal32(p1, s1), DataType::Decimal32(p2, s2)) => (p1, s1, p2, s2), + (DataType::Decimal64(p1, s1), DataType::Decimal64(p2, s2)) => (p1, s1, p2, s2), (DataType::Decimal128(p1, s1), DataType::Decimal128(p2, s2)) => (p1, s1, p2, s2), (DataType::Decimal256(p1, s1), DataType::Decimal256(p2, s2)) => (p1, s1, p2, s2), _ => unreachable!(), @@ -914,6 +932,28 @@ mod tests { "Arithmetic overflow: Overflow happened on: - -9223372036854775808" ); + let a = Decimal32Array::from(vec![1, 3, -44, 2, 4]) + .with_precision_and_scale(9, 6) + .unwrap(); + + let r = neg(&a).unwrap(); + 
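// Negating a Decimal32 array should preserve the Decimal32(9, 6)
+        // data type and flip the sign of each value; nulls pass through.
+ 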
assert_eq!(r.data_type(), a.data_type()); + assert_eq!( + r.as_primitive::().values(), + &[-1, -3, 44, -2, -4] + ); + + let a = Decimal64Array::from(vec![1, 3, -44, 2, 4]) + .with_precision_and_scale(9, 6) + .unwrap(); + + let r = neg(&a).unwrap(); + assert_eq!(r.data_type(), a.data_type()); + assert_eq!( + r.as_primitive::().values(), + &[-1, -3, 44, -2, -4] + ); + let a = Decimal128Array::from(vec![1, 3, -44, 2, 4]) .with_precision_and_scale(9, 6) .unwrap(); diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 296f5ae721b3..6e7a6a3626a8 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -713,6 +713,8 @@ pub fn make_array(data: ArrayData) -> ArrayRef { dt => panic!("Unexpected data type for run_ends array {dt:?}"), }, DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef, + DataType::Decimal32(_, _) => Arc::new(Decimal32Array::from(data)) as ArrayRef, + DataType::Decimal64(_, _) => Arc::new(Decimal64Array::from(data)) as ArrayRef, DataType::Decimal128(_, _) => Arc::new(Decimal128Array::from(data)) as ArrayRef, DataType::Decimal256(_, _) => Arc::new(Decimal256Array::from(data)) as ArrayRef, dt => panic!("Unexpected data type {dt:?}"), diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 567fa00e7385..da40b7775f30 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -410,6 +410,44 @@ pub type DurationMicrosecondArray = PrimitiveArray; /// A [`PrimitiveArray`] of elapsed durations in nanoseconds pub type DurationNanosecondArray = PrimitiveArray; +/// A [`PrimitiveArray`] of 32-bit fixed point decimals +/// +/// # Examples +/// +/// Construction +/// +/// ``` +/// # use arrow_array::Decimal32Array; +/// // Create from Vec> +/// let arr = Decimal32Array::from(vec![Some(1), None, Some(2)]); +/// // Create from Vec +/// let arr = Decimal32Array::from(vec![1, 2, 3]); +/// // Create iter/collect +/// let arr: Decimal32Array = std::iter::repeat(42).take(10).collect(); +/// ``` +/// +/// See [`PrimitiveArray`] for more information and examples +pub type Decimal32Array = PrimitiveArray; + +/// A [`PrimitiveArray`] of 64-bit fixed point decimals +/// +/// # Examples +/// +/// Construction +/// +/// ``` +/// # use arrow_array::Decimal64Array; +/// // Create from Vec> +/// let arr = Decimal64Array::from(vec![Some(1), None, Some(2)]); +/// // Create from Vec +/// let arr = Decimal64Array::from(vec![1, 2, 3]); +/// // Create iter/collect +/// let arr: Decimal64Array = std::iter::repeat(42).take(10).collect(); +/// ``` +/// +/// See [`PrimitiveArray`] for more information and examples +pub type Decimal64Array = PrimitiveArray; + /// A [`PrimitiveArray`] of 128-bit fixed point decimals /// /// # Examples @@ -418,7 +456,7 @@ pub type DurationNanosecondArray = PrimitiveArray; /// /// ``` /// # use arrow_array::Decimal128Array; -/// // Create from Vec> +/// // Create from Vec> /// let arr = Decimal128Array::from(vec![Some(1), None, Some(2)]); /// // Create from Vec /// let arr = Decimal128Array::from(vec![1, 2, 3]); @@ -672,6 +710,8 @@ impl PrimitiveArray { DataType::Timestamp(t1, _) => { matches!(data_type, DataType::Timestamp(t2, _) if &t1 == t2) } + DataType::Decimal32(_, _) => matches!(data_type, DataType::Decimal32(_, _)), + DataType::Decimal64(_, _) => matches!(data_type, DataType::Decimal64(_, _)), DataType::Decimal128(_, _) => matches!(data_type, DataType::Decimal128(_, _)), DataType::Decimal256(_, _) => matches!(data_type, 
DataType::Decimal256(_, _)), _ => T::DATA_TYPE.eq(data_type), @@ -1343,6 +1383,8 @@ def_from_for_primitive!(UInt64Type, u64); def_from_for_primitive!(Float16Type, f16); def_from_for_primitive!(Float32Type, f32); def_from_for_primitive!(Float64Type, f64); +def_from_for_primitive!(Decimal32Type, i32); +def_from_for_primitive!(Decimal64Type, i64); def_from_for_primitive!(Decimal128Type, i128); def_from_for_primitive!(Decimal256Type, i256); @@ -1455,6 +1497,8 @@ def_numeric_from_vec!(UInt64Type); def_numeric_from_vec!(Float16Type); def_numeric_from_vec!(Float32Type); def_numeric_from_vec!(Float64Type); +def_numeric_from_vec!(Decimal32Type); +def_numeric_from_vec!(Decimal64Type); def_numeric_from_vec!(Decimal128Type); def_numeric_from_vec!(Decimal256Type); @@ -1581,6 +1625,26 @@ impl PrimitiveArray { /// Returns the decimal precision of this array pub fn precision(&self) -> u8 { match T::BYTE_LENGTH { + 4 => { + if let DataType::Decimal32(p, _) = self.data_type() { + *p + } else { + unreachable!( + "Decimal32Array datatype is not DataType::Decimal32 but {}", + self.data_type() + ) + } + } + 8 => { + if let DataType::Decimal64(p, _) = self.data_type() { + *p + } else { + unreachable!( + "Decimal64Array datatype is not DataType::Decimal64 but {}", + self.data_type() + ) + } + } 16 => { if let DataType::Decimal128(p, _) = self.data_type() { *p @@ -1608,6 +1672,26 @@ impl PrimitiveArray { /// Returns the decimal scale of this array pub fn scale(&self) -> i8 { match T::BYTE_LENGTH { + 4 => { + if let DataType::Decimal32(_, s) = self.data_type() { + *s + } else { + unreachable!( + "Decimal32Array datatype is not DataType::Decimal32 but {}", + self.data_type() + ) + } + } + 8 => { + if let DataType::Decimal64(_, s) = self.data_type() { + *s + } else { + unreachable!( + "Decimal64Array datatype is not DataType::Decimal64 but {}", + self.data_type() + ) + } + } 16 => { if let DataType::Decimal128(_, s) = self.data_type() { *s @@ -1636,7 +1720,7 @@ impl PrimitiveArray { #[cfg(test)] mod tests { use super::*; - use crate::builder::{Decimal128Builder, Decimal256Builder}; + use crate::builder::{Decimal32Builder, Decimal64Builder, Decimal128Builder, Decimal256Builder}; use crate::cast::downcast_array; use crate::BooleanArray; use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano}; @@ -2246,6 +2330,42 @@ mod tests { let _ = PrimitiveArray::::from(foo.into_data()); } + #[test] + fn test_decimal32() { + let values: Vec<_> = vec![0, 1, -1, i32::MIN, i32::MAX]; + let array: PrimitiveArray = + PrimitiveArray::from_iter(values.iter().copied()); + assert_eq!(array.values(), &values); + + let array: PrimitiveArray = + PrimitiveArray::from_iter_values(values.iter().copied()); + assert_eq!(array.values(), &values); + + let array = PrimitiveArray::::from(values.clone()); + assert_eq!(array.values(), &values); + + let array = PrimitiveArray::::from(array.to_data()); + assert_eq!(array.values(), &values); + } + + #[test] + fn test_decimal64() { + let values: Vec<_> = vec![0, 1, -1, i64::MIN, i64::MAX]; + let array: PrimitiveArray = + PrimitiveArray::from_iter(values.iter().copied()); + assert_eq!(array.values(), &values); + + let array: PrimitiveArray = + PrimitiveArray::from_iter_values(values.iter().copied()); + assert_eq!(array.values(), &values); + + let array = PrimitiveArray::::from(values.clone()); + assert_eq!(array.values(), &values); + + let array = PrimitiveArray::::from(array.to_data()); + assert_eq!(array.values(), &values); + } + #[test] fn test_decimal128() { let values: Vec<_> = vec![0, 1, -1, 
i128::MIN, i128::MAX]; @@ -2517,6 +2637,74 @@ mod tests { assert!(!array.is_null(2)); } + #[test] + fn test_decimal64_iter() { + let mut builder = Decimal64Builder::with_capacity(30); + let decimal1 = 12345; + builder.append_value(decimal1); + + builder.append_null(); + + let decimal2 = 56789; + builder.append_value(decimal2); + + let array: Decimal64Array = builder.finish().with_precision_and_scale(18, 4).unwrap(); + + let collected: Vec<_> = array.iter().collect(); + assert_eq!(vec![Some(decimal1), None, Some(decimal2)], collected); + } + + #[test] + fn test_from_iter_decimal64array() { + let value1 = 12345; + let value2 = 56789; + + let mut array: Decimal64Array = + vec![Some(value1), None, Some(value2)].into_iter().collect(); + array = array.with_precision_and_scale(18, 4).unwrap(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::Decimal64(18, 4)); + assert_eq!(value1, array.value(0)); + assert!(!array.is_null(0)); + assert!(array.is_null(1)); + assert_eq!(value2, array.value(2)); + assert!(!array.is_null(2)); + } + + #[test] + fn test_decimal32_iter() { + let mut builder = Decimal32Builder::with_capacity(30); + let decimal1 = 12345; + builder.append_value(decimal1); + + builder.append_null(); + + let decimal2 = 56789; + builder.append_value(decimal2); + + let array: Decimal32Array = builder.finish().with_precision_and_scale(9, 2).unwrap(); + + let collected: Vec<_> = array.iter().collect(); + assert_eq!(vec![Some(decimal1), None, Some(decimal2)], collected); + } + + #[test] + fn test_from_iter_decimal32array() { + let value1 = 12345; + let value2 = 56789; + + let mut array: Decimal32Array = + vec![Some(value1), None, Some(value2)].into_iter().collect(); + array = array.with_precision_and_scale(9, 2).unwrap(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::Decimal32(9, 2)); + assert_eq!(value1, array.value(0)); + assert!(!array.is_null(0)); + assert!(array.is_null(1)); + assert_eq!(value2, array.value(2)); + assert!(!array.is_null(2)); + } + #[test] fn test_unary_opt() { let array = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7]); diff --git a/arrow-array/src/builder/buffer_builder.rs b/arrow-array/src/builder/buffer_builder.rs index ab67669febb8..64c9f8db1e50 100644 --- a/arrow-array/src/builder/buffer_builder.rs +++ b/arrow-array/src/builder/buffer_builder.rs @@ -43,6 +43,10 @@ pub type Float32BufferBuilder = BufferBuilder; /// Buffer builder for 64-bit floating point type. pub type Float64BufferBuilder = BufferBuilder; +/// Buffer builder for 32-bit decimal type. +pub type Decimal32BufferBuilder = BufferBuilder<::Native>; +/// Buffer builder for 64-bit decimal type. +pub type Decimal64BufferBuilder = BufferBuilder<::Native>; /// Buffer builder for 128-bit decimal type. pub type Decimal128BufferBuilder = BufferBuilder<::Native>; /// Buffer builder for 256-bit decimal type. diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 3191fea6e407..0b987d5e8d0b 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -87,6 +87,10 @@ pub type DurationMicrosecondBuilder = PrimitiveBuilder; /// An elapsed time in nanoseconds array builder. 
pub type DurationNanosecondBuilder = PrimitiveBuilder; +/// A decimal 32 array builder +pub type Decimal32Builder = PrimitiveBuilder; +/// A decimal 64 array builder +pub type Decimal64Builder = PrimitiveBuilder; /// A decimal 128 array builder pub type Decimal128Builder = PrimitiveBuilder; /// A decimal 256 array builder @@ -175,7 +179,8 @@ impl PrimitiveBuilder { /// data type of the generated array. /// /// This method allows overriding the data type, to allow specifying timezones - /// for [`DataType::Timestamp`] or precision and scale for [`DataType::Decimal128`] and [`DataType::Decimal256`] + /// for [`DataType::Timestamp`] or precision and scale for [`DataType::Decimal32`], + /// [`DataType::Decimal64`], [`DataType::Decimal128`] and [`DataType::Decimal256`] /// /// # Panics /// diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index c0e49b939f2c..8b13c1080210 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -186,6 +186,12 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { Box::new(FixedSizeBinaryBuilder::with_capacity(capacity, *len)) } + DataType::Decimal32(p, s) => Box::new( + Decimal32Builder::with_capacity(capacity).with_data_type(DataType::Decimal32(*p, *s)), + ), + DataType::Decimal64(p, s) => Box::new( + Decimal64Builder::with_capacity(capacity).with_data_type(DataType::Decimal64(*p, *s)), + ), DataType::Decimal128(p, s) => Box::new( Decimal128Builder::with_capacity(capacity).with_data_type(DataType::Decimal128(*p, *s)), ), diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 92262fc04a57..cf1e91556378 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -25,13 +25,17 @@ use crate::timezone::Tz; use crate::{ArrowNativeTypeOp, OffsetSizeTrait}; use arrow_buffer::{i256, Buffer, OffsetBuffer}; use arrow_data::decimal::{ - is_validate_decimal256_precision, is_validate_decimal_precision, validate_decimal256_precision, - validate_decimal_precision, + is_validate_decimal256_precision, is_validate_decimal_precision, + is_validate_decimal64_precision, is_validate_decimal32_precision, + validate_decimal256_precision, validate_decimal_precision, + validate_decimal64_precision, validate_decimal32_precision, }; use arrow_data::{validate_binary_view, validate_string_view}; use arrow_schema::{ - ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, - DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, + ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL32_MAX_PRECISION, DECIMAL32_MAX_SCALE, + DECIMAL64_MAX_PRECISION, DECIMAL64_MAX_SCALE, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, + DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL32_DEFAULT_SCALE, DECIMAL64_DEFAULT_SCALE, + DECIMAL_DEFAULT_SCALE, }; use chrono::{Duration, NaiveDate, NaiveDateTime}; use half::f16; @@ -1162,6 +1166,8 @@ mod decimal { use super::*; pub trait DecimalTypeSealed {} + impl DecimalTypeSealed for Decimal32Type {} + impl DecimalTypeSealed for Decimal64Type {} impl DecimalTypeSealed for Decimal128Type {} impl DecimalTypeSealed for Decimal256Type {} } @@ -1169,10 +1175,12 @@ mod decimal { /// A trait over the decimal types, used by [`PrimitiveArray`] to provide a generic /// implementation across the various decimal types /// -/// Implemented by [`Decimal128Type`] and [`Decimal256Type`] for [`Decimal128Array`] -/// and [`Decimal256Array`] respectively +/// Implemented by 
[`Decimal32Type`], [`Decimal64Type`], [`Decimal128Type`] and [`Decimal256Type`]
+/// for [`Decimal32Array`], [`Decimal64Array`], [`Decimal128Array`] and [`Decimal256Array`] respectively
 ///
 /// [`PrimitiveArray`]: crate::array::PrimitiveArray
+/// [`Decimal32Array`]: crate::array::Decimal32Array
+/// [`Decimal64Array`]: crate::array::Decimal64Array
 /// [`Decimal128Array`]: crate::array::Decimal128Array
 /// [`Decimal256Array`]: crate::array::Decimal256Array
 pub trait DecimalType:
@@ -1189,7 +1197,7 @@ pub trait DecimalType:
     /// Default values for [`DataType`]
     const DEFAULT_TYPE: DataType;
 
-    /// "Decimal128" or "Decimal256", for use in error messages
+    /// "Decimal32", "Decimal64", "Decimal128" or "Decimal256", for use in error messages
     const PREFIX: &'static str;
 
     /// Formats the decimal value with the provided precision and scale
@@ -1242,6 +1250,74 @@ pub fn validate_decimal_precision_and_scale(
     Ok(())
 }
 
+/// The decimal type for a Decimal32Array
+#[derive(Debug)]
+pub struct Decimal32Type {}
+
+impl DecimalType for Decimal32Type {
+    const BYTE_LENGTH: usize = 4;
+    const MAX_PRECISION: u8 = DECIMAL32_MAX_PRECISION;
+    const MAX_SCALE: i8 = DECIMAL32_MAX_SCALE;
+    const TYPE_CONSTRUCTOR: fn(u8, i8) -> DataType = DataType::Decimal32;
+    const DEFAULT_TYPE: DataType =
+        DataType::Decimal32(DECIMAL32_MAX_PRECISION, DECIMAL32_DEFAULT_SCALE);
+    const PREFIX: &'static str = "Decimal32";
+
+    fn format_decimal(value: Self::Native, precision: u8, scale: i8) -> String {
+        format_decimal_str(&value.to_string(), precision as usize, scale)
+    }
+
+    fn validate_decimal_precision(num: i32, precision: u8) -> Result<(), ArrowError> {
+        validate_decimal32_precision(num, precision)
+    }
+
+    fn is_valid_decimal_precision(value: Self::Native, precision: u8) -> bool {
+        is_validate_decimal32_precision(value, precision)
+    }
+}
+
+impl ArrowPrimitiveType for Decimal32Type {
+    type Native = i32;
+
+    const DATA_TYPE: DataType = <Self as DecimalType>::DEFAULT_TYPE;
+}
+
+impl primitive::PrimitiveTypeSealed for Decimal32Type {}
+
+/// The decimal type for a Decimal64Array
+#[derive(Debug)]
+pub struct Decimal64Type {}
+
+impl DecimalType for Decimal64Type {
+    const BYTE_LENGTH: usize = 8;
+    const MAX_PRECISION: u8 = DECIMAL64_MAX_PRECISION;
+    const MAX_SCALE: i8 = DECIMAL64_MAX_SCALE;
+    const TYPE_CONSTRUCTOR: fn(u8, i8) -> DataType = DataType::Decimal64;
+    const DEFAULT_TYPE: DataType =
+        DataType::Decimal64(DECIMAL64_MAX_PRECISION, DECIMAL64_DEFAULT_SCALE);
+    const PREFIX: &'static str = "Decimal64";
+
+    fn format_decimal(value: Self::Native, precision: u8, scale: i8) -> String {
+        format_decimal_str(&value.to_string(), precision as usize, scale)
+    }
+
+    fn validate_decimal_precision(num: i64, precision: u8) -> Result<(), ArrowError> {
+        validate_decimal64_precision(num, precision)
+    }
+
+    fn is_valid_decimal_precision(value: Self::Native, precision: u8) -> bool {
+        is_validate_decimal64_precision(value, precision)
+    }
+}
+
+impl ArrowPrimitiveType for Decimal64Type {
+    type Native = i64;
+
+    const DATA_TYPE: DataType = <Self as DecimalType>::DEFAULT_TYPE;
+}
+
+impl primitive::PrimitiveTypeSealed for Decimal64Type {}
+
 /// The decimal type for a Decimal128Array
 #[derive(Debug)]
 pub struct Decimal128Type {}
@@ -1613,6 +1689,8 @@ mod tests {
         test_layout::();
         test_layout::();
         test_layout::();
+        test_layout::<Decimal32Type>();
+        test_layout::<Decimal64Type>();
         test_layout::();
         test_layout::();
         test_layout::();
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 7abadf5793b3..2b6881d6d5fa 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -168,
+168,25 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { _ => false }, // cast one decimal type to another decimal type - (Decimal128(_, _), Decimal128(_, _)) => true, - (Decimal256(_, _), Decimal256(_, _)) => true, - (Decimal128(_, _), Decimal256(_, _)) => true, - (Decimal256(_, _), Decimal128(_, _)) => true, + (Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _), + Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _)) | // unsigned integer to decimal - (UInt8 | UInt16 | UInt32 | UInt64, Decimal128(_, _)) | - (UInt8 | UInt16 | UInt32 | UInt64, Decimal256(_, _)) | + (UInt8 | UInt16 | UInt32 | UInt64, + Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _)) | // signed numeric to decimal - (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal128(_, _)) | - (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal256(_, _)) | + (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, + Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _)) | // decimal to unsigned numeric - (Decimal128(_, _) | Decimal256(_, _), UInt8 | UInt16 | UInt32 | UInt64) | + (Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _), + UInt8 | UInt16 | UInt32 | UInt64) | // decimal to signed numeric - (Decimal128(_, _) | Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) => true, + (Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _), + Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) | // decimal to Utf8 - (Decimal128(_, _) | Decimal256(_, _), Utf8 | LargeUtf8) => true, + (Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _), + Utf8 | LargeUtf8) | // Utf8 to decimal - (Utf8 | LargeUtf8, Decimal128(_, _) | Decimal256(_, _)) => true, + (Utf8 | LargeUtf8, Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _)) => true, (Struct(from_fields), Struct(to_fields)) => { from_fields.len() == to_fields.len() && from_fields.iter().zip(to_fields.iter()).all(|(f1, f2)| { diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index df96816ea23a..9ac5da644cf0 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -474,7 +474,7 @@ macro_rules! 
decimal_display {
     };
 }
 
-decimal_display!(Decimal128Type, Decimal256Type);
+decimal_display!(Decimal32Type, Decimal64Type, Decimal128Type, Decimal256Type);
 
 fn write_timestamp(
     f: &mut dyn Write,
diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 8af2a91cf159..e5e51a0e75b6 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -144,7 +144,7 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
     DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
         [empty_buffer, MutableBuffer::new(0)]
     }
-    DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [
+    DataType::Decimal32(_, _) | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [
         MutableBuffer::new(capacity * mem::size_of::()),
         empty_buffer,
     ],
@@ -1587,6 +1587,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout {
             DataTypeLayout::new_fixed_width::()
         }
         DataType::Duration(_) => DataTypeLayout::new_fixed_width::(),
+        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
+        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
         DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::(),
         DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::(),
         DataType::FixedSizeBinary(size) => {
diff --git a/arrow-data/src/decimal.rs b/arrow-data/src/decimal.rs
index fe19db641236..9c629cb49ff8 100644
--- a/arrow-data/src/decimal.rs
+++ b/arrow-data/src/decimal.rs
@@ -23,8 +23,9 @@ use arrow_buffer::i256;
 use arrow_schema::ArrowError;
 
 pub use arrow_schema::{
+    DECIMAL32_MAX_PRECISION, DECIMAL32_MAX_SCALE, DECIMAL64_MAX_PRECISION, DECIMAL64_MAX_SCALE,
     DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE,
-    DECIMAL_DEFAULT_SCALE,
+    DECIMAL32_DEFAULT_SCALE, DECIMAL64_DEFAULT_SCALE, DECIMAL_DEFAULT_SCALE,
 };
 
 /// MAX decimal256 value of little-endian format for each precision.
@@ -833,9 +834,161 @@ pub(crate) const MIN_DECIMAL_FOR_EACH_PRECISION_ONE_BASED: [i128; 39] = [
     -99999999999999999999999999999999999999,
 ];
 
-/// Validates that the specified `i128` value can be properly
+/// `MAX_DECIMAL64_FOR_EACH_PRECISION_ONE_BASED[p]` holds the maximum `i64` value that can
+/// be stored in a [arrow_schema::DataType::Decimal64] value of precision `p`.
+/// The first element is unused and is inserted so that we can look up using
+/// precision as the index without the need to subtract 1 first.
+pub(crate) const MAX_DECIMAL64_FOR_EACH_PRECISION_ONE_BASED: [i64; 19] = [
+    0, // unused first element
+    9,
+    99,
+    999,
+    9999,
+    99999,
+    999999,
+    9999999,
+    99999999,
+    999999999,
+    9999999999,
+    99999999999,
+    999999999999,
+    9999999999999,
+    99999999999999,
+    999999999999999,
+    9999999999999999,
+    99999999999999999,
+    999999999999999999,
+];
+
+/// `MIN_DECIMAL64_FOR_EACH_PRECISION_ONE_BASED[p]` holds the minimum `i64` value that can
+/// be stored in a [arrow_schema::DataType::Decimal64] value of precision `p`.
+/// The first element is unused and is inserted so that we can look up using
+/// precision as the index without the need to subtract 1 first.
+pub(crate) const MIN_DECIMAL64_FOR_EACH_PRECISION_ONE_BASED: [i64; 19] = [
+    0, // unused first element
+    -9,
+    -99,
+    -999,
+    -9999,
+    -99999,
+    -999999,
+    -9999999,
+    -99999999,
+    -999999999,
+    -9999999999,
+    -99999999999,
+    -999999999999,
+    -9999999999999,
+    -99999999999999,
+    -999999999999999,
+    -9999999999999999,
+    -99999999999999999,
+    -999999999999999999,
+];
+
+/// `MAX_DECIMAL32_FOR_EACH_PRECISION_ONE_BASED[p]` holds the maximum `i32` value that can
+/// be stored in a [arrow_schema::DataType::Decimal32] value of precision `p`.
+/// The first element is unused and is inserted so that we can look up using
+/// precision as the index without the need to subtract 1 first.
+pub(crate) const MAX_DECIMAL32_FOR_EACH_PRECISION_ONE_BASED: [i32; 10] = [
+    0, // unused first element
+    9,
+    99,
+    999,
+    9999,
+    99999,
+    999999,
+    9999999,
+    99999999,
+    999999999,
+];
+
+/// `MIN_DECIMAL32_FOR_EACH_PRECISION_ONE_BASED[p]` holds the minimum `i32` value that can
+/// be stored in a [arrow_schema::DataType::Decimal32] value of precision `p`.
+/// The first element is unused and is inserted so that we can look up using
+/// precision as the index without the need to subtract 1 first.
+pub(crate) const MIN_DECIMAL32_FOR_EACH_PRECISION_ONE_BASED: [i32; 10] = [
+    0, // unused first element
+    -9,
+    -99,
+    -999,
+    -9999,
+    -99999,
+    -999999,
+    -9999999,
+    -99999999,
+    -999999999,
+];
+
+/// Validates that the specified `i32` value can be properly
 /// interpreted as a Decimal32 number with precision `precision`
 #[inline]
+pub fn validate_decimal32_precision(value: i32, precision: u8) -> Result<(), ArrowError> {
+    if precision > DECIMAL32_MAX_PRECISION {
+        return Err(ArrowError::InvalidArgumentError(format!(
+            "Max precision of a Decimal32 is {DECIMAL32_MAX_PRECISION}, but got {precision}",
+        )));
+    }
+    if value > MAX_DECIMAL32_FOR_EACH_PRECISION_ONE_BASED[precision as usize] {
+        Err(ArrowError::InvalidArgumentError(format!(
+            "{value} is too large to store in a Decimal32 of precision {precision}. Max is {}",
+            MAX_DECIMAL32_FOR_EACH_PRECISION_ONE_BASED[precision as usize]
+        )))
+    } else if value < MIN_DECIMAL32_FOR_EACH_PRECISION_ONE_BASED[precision as usize] {
+        Err(ArrowError::InvalidArgumentError(format!(
+            "{value} is too small to store in a Decimal32 of precision {precision}. Min is {}",
+            MIN_DECIMAL32_FOR_EACH_PRECISION_ONE_BASED[precision as usize]
+        )))
+    } else {
+        Ok(())
+    }
+}
+
+/// Determines whether the specified `i32` value can be properly
+/// interpreted as a Decimal32 number with precision `precision`
+#[inline]
+pub fn is_validate_decimal32_precision(value: i32, precision: u8) -> bool {
+    precision <= DECIMAL32_MAX_PRECISION
+        && value >= MIN_DECIMAL32_FOR_EACH_PRECISION_ONE_BASED[precision as usize]
+        && value <= MAX_DECIMAL32_FOR_EACH_PRECISION_ONE_BASED[precision as usize]
+}
+
+/// Validates that the specified `i64` value can be properly
+/// interpreted as a Decimal64 number with precision `precision`
+#[inline]
+pub fn validate_decimal64_precision(value: i64, precision: u8) -> Result<(), ArrowError> {
+    if precision > DECIMAL64_MAX_PRECISION {
+        return Err(ArrowError::InvalidArgumentError(format!(
+            "Max precision of a Decimal64 is {DECIMAL64_MAX_PRECISION}, but got {precision}",
+        )));
+    }
+    if value > MAX_DECIMAL64_FOR_EACH_PRECISION_ONE_BASED[precision as usize] {
+        Err(ArrowError::InvalidArgumentError(format!(
+            "{value} is too large to store in a Decimal64 of precision {precision}.
Max is {}", + MAX_DECIMAL64_FOR_EACH_PRECISION_ONE_BASED[precision as usize] + ))) + } else if value < MIN_DECIMAL64_FOR_EACH_PRECISION_ONE_BASED[precision as usize] { + Err(ArrowError::InvalidArgumentError(format!( + "{value} is too small to store in a Decimal64 of precision {precision}. Min is {}", + MIN_DECIMAL64_FOR_EACH_PRECISION_ONE_BASED[precision as usize] + ))) + } else { + Ok(()) + } +} + +/// Determines whether the specified `i64` value can be properly +/// interpreted as a Decimal64 number with precision `precision` +#[inline] +pub fn is_validate_decimal64_precision(value: i64, precision: u8) -> bool { + precision <= DECIMAL64_MAX_PRECISION + && value >= MIN_DECIMAL64_FOR_EACH_PRECISION_ONE_BASED[precision as usize] + && value <= MAX_DECIMAL64_FOR_EACH_PRECISION_ONE_BASED[precision as usize] +} + +/// Validates that the specified `i128` value can be properly +/// interpreted as a Decimal128 number with precision `precision` +#[inline] pub fn validate_decimal_precision(value: i128, precision: u8) -> Result<(), ArrowError> { if precision > DECIMAL128_MAX_PRECISION { return Err(ArrowError::InvalidArgumentError(format!( @@ -858,7 +1011,7 @@ pub fn validate_decimal_precision(value: i128, precision: u8) -> Result<(), Arro } /// Determines whether the specified `i128` value can be properly -/// interpreted as a Decimal number with precision `precision` +/// interpreted as a Decimal128 number with precision `precision` #[inline] pub fn is_validate_decimal_precision(value: i128, precision: u8) -> bool { precision <= DECIMAL128_MAX_PRECISION diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index f24179b61700..1c16ee2f8a14 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -78,6 +78,8 @@ fn equal_values( DataType::Int64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float32 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Float64 => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Decimal32(_, _) => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::Decimal64(_, _) => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Decimal128(_, _) => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Decimal256(_, _) => primitive_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => { diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index c74b0c43481a..f9338c5d8337 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -256,6 +256,8 @@ fn build_extend(array: &ArrayData) -> Extend { | DataType::Duration(_) | DataType::Interval(IntervalUnit::DayTime) => primitive::build_extend::(array), DataType::Interval(IntervalUnit::MonthDayNano) => primitive::build_extend::(array), + DataType::Decimal32(_, _) => primitive::build_extend::(array), + DataType::Decimal64(_, _) => primitive::build_extend::(array), DataType::Decimal128(_, _) => primitive::build_extend::(array), DataType::Decimal256(_, _) => primitive::build_extend::(array), DataType::Utf8 | DataType::Binary => variable_size::build_extend::(array), @@ -302,6 +304,8 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { | DataType::Duration(_) | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::, DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::, + DataType::Decimal32(_, _) => primitive::extend_nulls::, + 
DataType::Decimal64(_, _) => primitive::extend_nulls::, DataType::Decimal128(_, _) => primitive::extend_nulls::, DataType::Decimal256(_, _) => primitive::extend_nulls::, DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::, @@ -455,7 +459,9 @@ impl<'a> MutableArrayData<'a> { }; let child_data = match &data_type { - DataType::Decimal128(_, _) + DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) + | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) | DataType::Null | DataType::Boolean diff --git a/arrow-integration-test/src/datatype.rs b/arrow-integration-test/src/datatype.rs index e45e94c24e07..4c17fbe76be7 100644 --- a/arrow-integration-test/src/datatype.rs +++ b/arrow-integration-test/src/datatype.rs @@ -60,14 +60,14 @@ pub fn data_type_from_json(json: &serde_json::Value) -> Result { _ => 128, // Default bit width }; - if bit_width == 128 { - Ok(DataType::Decimal128(precision, scale)) - } else if bit_width == 256 { - Ok(DataType::Decimal256(precision, scale)) - } else { - Err(ArrowError::ParseError( + match bit_width { + 32 => Ok(DataType::Decimal32(precision, scale)), + 64 => Ok(DataType::Decimal64(precision, scale)), + 128 => Ok(DataType::Decimal128(precision, scale)), + 256 => Ok(DataType::Decimal256(precision, scale)), + _ => Err(ArrowError::ParseError( "Decimal bit_width invalid".to_string(), - )) + )), } } Some(s) if s == "floatingpoint" => match map.get("precision") { @@ -337,6 +337,12 @@ pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { TimeUnit::Nanosecond => "NANOSECOND", }}), DataType::Dictionary(_, _) => json!({ "name": "dictionary"}), + DataType::Decimal32(precision, scale) => { + json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 32}) + } + DataType::Decimal64(precision, scale) => { + json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 64}) + } DataType::Decimal128(precision, scale) => { json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 128}) } diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index ea5b545f2e81..2dff479d484b 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -812,6 +812,42 @@ pub fn array_from_json( ))), } } + DataType::Decimal32(precision, scale) => { + let mut b = Decimal32Builder::with_capacity(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_str().unwrap().parse::().unwrap()), + _ => b.append_null(), + }; + } + Ok(Arc::new( + b.finish().with_precision_and_scale(*precision, *scale)?, + )) + } + DataType::Decimal64(precision, scale) => { + let mut b = Decimal64Builder::with_capacity(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_str().unwrap().parse::().unwrap()), + _ => b.append_null(), + }; + } + Ok(Arc::new( + b.finish().with_precision_and_scale(*precision, *scale)?, + )) + } DataType::Decimal128(precision, scale) => { let mut b = Decimal128Builder::with_capacity(json_col.count); for (is_valid, value) in json_col diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index eef236529e10..1ed4c17e5bde 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -453,18 +453,14 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat 
crate::Type::Decimal => {
             let fsb = field.type_as_decimal().unwrap();
             let bit_width = fsb.bitWidth();
-            if bit_width == 128 {
-                DataType::Decimal128(
-                    fsb.precision().try_into().unwrap(),
-                    fsb.scale().try_into().unwrap(),
-                )
-            } else if bit_width == 256 {
-                DataType::Decimal256(
-                    fsb.precision().try_into().unwrap(),
-                    fsb.scale().try_into().unwrap(),
-                )
-            } else {
-                panic!("Unexpected decimal bit width {bit_width}")
+            let precision: u8 = fsb.precision().try_into().unwrap();
+            let scale: i8 = fsb.scale().try_into().unwrap();
+            match bit_width {
+                32 => DataType::Decimal32(precision, scale),
+                64 => DataType::Decimal64(precision, scale),
+                128 => DataType::Decimal128(precision, scale),
+                256 => DataType::Decimal256(precision, scale),
+                _ => panic!("Unexpected decimal bit width {bit_width}"),
             }
         }
         crate::Type::Union => {
@@ -830,6 +826,28 @@ pub(crate) fn get_fb_field_type<'a>(
             // type in the DictionaryEncoding metadata in the parent field
             get_fb_field_type(value_type, dictionary_tracker, fbb)
         }
+        Decimal32(precision, scale) => {
+            let mut builder = crate::DecimalBuilder::new(fbb);
+            builder.add_precision(*precision as i32);
+            builder.add_scale(*scale as i32);
+            builder.add_bitWidth(32);
+            FBFieldType {
+                type_type: crate::Type::Decimal,
+                type_: builder.finish().as_union_value(),
+                children: Some(fbb.create_vector(&empty_fields[..])),
+            }
+        }
+        Decimal64(precision, scale) => {
+            let mut builder = crate::DecimalBuilder::new(fbb);
+            builder.add_precision(*precision as i32);
+            builder.add_scale(*scale as i32);
+            builder.add_bitWidth(64);
+            FBFieldType {
+                type_type: crate::Type::Decimal,
+                type_: builder.finish().as_union_value(),
+                children: Some(fbb.create_vector(&empty_fields[..])),
+            }
+        }
         Decimal128(precision, scale) => {
             let mut builder = crate::DecimalBuilder::new(fbb);
             builder.add_precision(*precision as i32);
diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs
index ff5832dfa68c..1fa24847899c 100644
--- a/arrow-schema/src/datatype.rs
+++ b/arrow-schema/src/datatype.rs
@@ -331,6 +331,34 @@ pub enum DataType {
     /// This type mostly used to represent low cardinality string
     /// arrays or a limited set of primitive types as integers.
     Dictionary(Box<DataType>, Box<DataType>),
+    /// Exact 32-bit width decimal value with precision and scale
+    ///
+    /// * precision is the total number of digits
+    /// * scale is the number of digits past the decimal
+    ///
+    /// For example the number 123.45 has precision 5 and scale 2.
+    ///
+    /// In certain situations, scale could be a negative number. For
+    /// negative scale, it is the number of padding zeros to the right
+    /// of the digits.
+    ///
+    /// For example the number 12300 could be treated as a decimal
+    /// with precision 3 and scale -2.
+    Decimal32(u8, i8),
+    /// Exact 64-bit width decimal value with precision and scale
+    ///
+    /// * precision is the total number of digits
+    /// * scale is the number of digits past the decimal
+    ///
+    /// For example the number 123.45 has precision 5 and scale 2.
+    ///
+    /// In certain situations, scale could be a negative number. For
+    /// negative scale, it is the number of padding zeros to the right
+    /// of the digits.
+    ///
+    /// For example the number 12300 could be treated as a decimal
+    /// with precision 3 and scale -2.
+ Decimal64(u8, i8), /// Exact 128-bit width decimal value with precision and scale /// /// * precision is the total number of digits @@ -489,6 +517,8 @@ impl DataType { | Float16 | Float32 | Float64 + | Decimal32(_, _) + | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _) ) @@ -641,6 +671,8 @@ impl DataType { DataType::Interval(IntervalUnit::YearMonth) => Some(4), DataType::Interval(IntervalUnit::DayTime) => Some(8), DataType::Interval(IntervalUnit::MonthDayNano) => Some(16), + DataType::Decimal32(_, _) => Some(4), + DataType::Decimal64(_, _) => Some(8), DataType::Decimal128(_, _) => Some(16), DataType::Decimal256(_, _) => Some(32), DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => None, @@ -691,6 +723,8 @@ impl DataType { | DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + | DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => 0, DataType::Timestamp(_, s) => s.as_ref().map(|s| s.len()).unwrap_or_default(), @@ -764,6 +798,18 @@ impl DataType { } } +/// The maximum precision for [DataType::Decimal32] values +pub const DECIMAL32_MAX_PRECISION: u8 = 9; + +/// The maximum scale for [DataType::Decimal32] values +pub const DECIMAL32_MAX_SCALE: i8 = 9; + +/// The maximum precision for [DataType::Decimal64] values +pub const DECIMAL64_MAX_PRECISION: u8 = 18; + +/// The maximum scale for [DataType::Decimal64] values +pub const DECIMAL64_MAX_SCALE: i8 = 18; + /// The maximum precision for [DataType::Decimal128] values pub const DECIMAL128_MAX_PRECISION: u8 = 38; @@ -776,6 +822,12 @@ pub const DECIMAL256_MAX_PRECISION: u8 = 76; /// The maximum scale for [DataType::Decimal256] values pub const DECIMAL256_MAX_SCALE: i8 = 76; +/// The default scale for [DataType::Decimal32] values +pub const DECIMAL32_DEFAULT_SCALE: i8 = 2; + +/// The default scale for [DataType::Decimal64] values +pub const DECIMAL64_DEFAULT_SCALE: i8 = 6; + /// The default scale for [DataType::Decimal128] and [DataType::Decimal256] /// values pub const DECIMAL_DEFAULT_SCALE: i8 = 10; diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index 40d411ba27ca..0ea0153f43e3 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -72,6 +72,8 @@ impl<'a> Parser<'a> { Token::Duration => self.parse_duration(), Token::Interval => self.parse_interval(), Token::FixedSizeBinary => self.parse_fixed_size_binary(), + Token::Decimal32 => self.parse_decimal_32(), + Token::Decimal64 => self.parse_decimal_64(), Token::Decimal128 => self.parse_decimal_128(), Token::Decimal256 => self.parse_decimal_256(), Token::Dictionary => self.parse_dictionary(), @@ -259,6 +261,26 @@ impl<'a> Parser<'a> { Ok(DataType::FixedSizeBinary(length)) } + /// Parses the next Decimal32 (called after `Decimal32` has been consumed) + fn parse_decimal_32(&mut self) -> ArrowResult { + self.expect_token(Token::LParen)?; + let precision = self.parse_u8("Decimal32")?; + self.expect_token(Token::Comma)?; + let scale = self.parse_i8("Decimal32")?; + self.expect_token(Token::RParen)?; + Ok(DataType::Decimal32(precision, scale)) + } + + /// Parses the next Decimal64 (called after `Decimal64` has been consumed) + fn parse_decimal_64(&mut self) -> ArrowResult { + self.expect_token(Token::LParen)?; + let precision = self.parse_u8("Decimal64")?; + self.expect_token(Token::Comma)?; + let scale = self.parse_i8("Decimal64")?; + self.expect_token(Token::RParen)?; + Ok(DataType::Decimal64(precision, scale)) + } + /// Parses the next Decimal128 
(called after `Decimal128` has been consumed) fn parse_decimal_128(&mut self) -> ArrowResult { self.expect_token(Token::LParen)?; @@ -469,6 +491,9 @@ impl<'a> Tokenizer<'a> { "Dictionary" => Token::Dictionary, "FixedSizeBinary" => Token::FixedSizeBinary, + + "Decimal32" => Token::Decimal32, + "Decimal64" => Token::Decimal64, "Decimal128" => Token::Decimal128, "Decimal256" => Token::Decimal256, @@ -531,6 +556,8 @@ enum Token { Duration, Interval, FixedSizeBinary, + Decimal32, + Decimal64, Decimal128, Decimal256, Dictionary, @@ -568,6 +595,8 @@ impl Display for Token { Token::Some => write!(f, "Some"), Token::None => write!(f, "None"), Token::FixedSizeBinary => write!(f, "FixedSizeBinary"), + Token::Decimal32 => write!(f, "Decimal32"), + Token::Decimal64 => write!(f, "Decimal64"), Token::Decimal128 => write!(f, "Decimal128"), Token::Decimal256 => write!(f, "Decimal256"), Token::Dictionary => write!(f, "Dictionary"), @@ -658,6 +687,8 @@ mod test { DataType::Utf8, DataType::Utf8View, DataType::LargeUtf8, + DataType::Decimal32(7, 8), + DataType::Decimal64(6, 9), DataType::Decimal128(7, 12), DataType::Decimal256(6, 13), // --------- @@ -750,8 +781,12 @@ mod test { // too large for i32 ("FixedSizeBinary(4000000000), ", "Error converting 4000000000 into i32 for FixedSizeBinary: out of range integral type conversion attempted"), // can't have negative precision + ("Decimal32(-3, 5)", "Error converting -3 into u8 for Decimal32: out of range integral type conversion attempted"), + ("Decimal64(-3, 5)", "Error converting -3 into u8 for Decimal64: out of range integral type conversion attempted"), ("Decimal128(-3, 5)", "Error converting -3 into u8 for Decimal128: out of range integral type conversion attempted"), ("Decimal256(-3, 5)", "Error converting -3 into u8 for Decimal256: out of range integral type conversion attempted"), + ("Decimal32(3, 500)", "Error converting 500 into i8 for Decimal32: out of range integral type conversion attempted"), + ("Decimal64(3, 500)", "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted"), ("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"), ("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"), diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index e12c37da4898..dc49fb68bac5 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -504,9 +504,6 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { DataType::Decimal128(parsed_precision, parsed_scale) }, [precision, scale, bits] => { - if *bits != "128" && *bits != "256" { - return Err(ArrowError::CDataInterface("Only 128/256 bit wide decimal is supported in the Rust implementation".to_string())); - } let parsed_precision = precision.parse::().map_err(|_| { ArrowError::CDataInterface( "The decimal type requires an integer precision".to_string(), @@ -517,10 +514,13 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { "The decimal type requires an integer scale".to_string(), ) })?; - if *bits == "128" { - DataType::Decimal128(parsed_precision, parsed_scale) - } else { - DataType::Decimal256(parsed_precision, parsed_scale) + let parsed_bits = bits.parse::().unwrap_or(0); + match parsed_bits { + 32 => DataType::Decimal32(parsed_precision, parsed_scale), + 64 => DataType::Decimal64(parsed_precision, parsed_scale), + 128 => DataType::Decimal128(parsed_precision, parsed_scale), + 256 => DataType::Decimal256(parsed_precision, parsed_scale), 
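+                        // Any other reported bit width falls through to the error below.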
+ _ => return Err(ArrowError::CDataInterface("Only 32/64/128/256 bit wide decimals are supported in the Rust implementation".to_string())), } } _ => { @@ -703,6 +703,8 @@ fn get_format_string(dtype: &DataType) -> Result { DataType::LargeUtf8 => Ok("U".to_string()), DataType::FixedSizeBinary(num_bytes) => Ok(format!("w:{num_bytes}")), DataType::FixedSizeList(_, num_elems) => Ok(format!("+w:{num_elems}")), + DataType::Decimal32(precision, scale) => Ok(format!("d:{precision},{scale},32")), + DataType::Decimal64(precision, scale) => Ok(format!("d:{precision},{scale},64")), DataType::Decimal128(precision, scale) => Ok(format!("d:{precision},{scale}")), DataType::Decimal256(precision, scale) => Ok(format!("d:{precision},{scale},256")), DataType::Date32 => Ok("tdD".to_string()), diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index b532ea8616b6..f9545590966a 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -544,6 +544,8 @@ impl Field { | DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + | DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { if from.data_type == DataType::Null { diff --git a/arrow/benches/array_from_vec.rs b/arrow/benches/array_from_vec.rs index fd83ad5c2a10..86705eb7fb43 100644 --- a/arrow/benches/array_from_vec.rs +++ b/arrow/benches/array_from_vec.rs @@ -73,6 +73,28 @@ fn struct_array_from_vec( criterion::black_box(StructArray::try_from(vec![(field1, strings), (field2, ints)]).unwrap()); } +fn decimal32_array_from_vec(array: &[Option]) { + criterion::black_box( + array + .iter() + .copied() + .collect::() + .with_precision_and_scale(9, 2) + .unwrap(), + ); +} + +fn decimal64_array_from_vec(array: &[Option]) { + criterion::black_box( + array + .iter() + .copied() + .collect::() + .with_precision_and_scale(17, 2) + .unwrap(), + ); +} + fn decimal128_array_from_vec(array: &[Option]) { criterion::black_box( array @@ -96,6 +118,30 @@ fn decimal256_array_from_vec(array: &[Option]) { } fn decimal_benchmark(c: &mut Criterion) { + // bench decimal32 array + // create option array + let size: usize = 1 << 15; + let mut rng = rand::thread_rng(); + let mut array = vec![]; + for _ in 0..size { + array.push(Some(rng.gen_range::(0..99999999))); + } + c.bench_function("decimal32_array_from_vec 32768", |b| { + b.iter(|| decimal32_array_from_vec(array.as_slice())) + }); + + // bench decimal64 array + // create option array + let size: usize = 1 << 15; + let mut rng = rand::thread_rng(); + let mut array = vec![]; + for _ in 0..size { + array.push(Some(rng.gen_range::(0..9999999999))); + } + c.bench_function("decimal64_array_from_vec 32768", |b| { + b.iter(|| decimal64_array_from_vec(array.as_slice())) + }); + // bench decimal128 array // create option array let size: usize = 1 << 15; diff --git a/arrow/benches/builder.rs b/arrow/benches/builder.rs index 87a02e7ad1fd..312627972e54 100644 --- a/arrow/benches/builder.rs +++ b/arrow/benches/builder.rs @@ -107,6 +107,42 @@ fn bench_string(c: &mut Criterion) { group.finish(); } +fn bench_decimal32(c: &mut Criterion) { + c.bench_function("bench_decimal32_builder", |b| { + b.iter(|| { + let mut rng = rand::thread_rng(); + let mut decimal_builder = Decimal32Builder::with_capacity(BATCH_SIZE); + for _ in 0..BATCH_SIZE { + decimal_builder.append_value(rng.gen_range::(0..999999999)); + } + black_box( + decimal_builder + .finish() + .with_precision_and_scale(9, 0) + .unwrap(), + ); + }) + }); +} + +fn bench_decimal64(c: &mut Criterion) { + 
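+    // Same pattern as bench_decimal32 above, but with values of up to ten
+    // digits, which need Decimal64's wider 18-digit range.
+ 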
c.bench_function("bench_decimal64_builder", |b| { + b.iter(|| { + let mut rng = rand::thread_rng(); + let mut decimal_builder = Decimal64Builder::with_capacity(BATCH_SIZE); + for _ in 0..BATCH_SIZE { + decimal_builder.append_value(rng.gen_range::(0..9999999999)); + } + black_box( + decimal_builder + .finish() + .with_precision_and_scale(18, 0) + .unwrap(), + ); + }) + }); +} + fn bench_decimal128(c: &mut Criterion) { c.bench_function("bench_decimal128_builder", |b| { b.iter(|| { @@ -126,7 +162,7 @@ fn bench_decimal128(c: &mut Criterion) { } fn bench_decimal256(c: &mut Criterion) { - c.bench_function("bench_decimal128_builder", |b| { + c.bench_function("bench_decimal256_builder", |b| { b.iter(|| { let mut rng = rand::thread_rng(); let mut decimal_builder = Decimal256Builder::with_capacity(BATCH_SIZE); @@ -150,6 +186,8 @@ criterion_group!( bench_primitive_nulls, bench_bool, bench_string, + bench_decimal32, + bench_decimal64, bench_decimal128, bench_decimal256, ); diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index ec7990d3d764..dab09a46284a 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -82,6 +82,36 @@ fn build_utf8_date_time_array(size: usize, with_nulls: bool) -> ArrayRef { Arc::new(builder.finish()) } +fn build_decimal32_array(size: usize, precision: u8, scale: i8) -> ArrayRef { + let mut rng = seedable_rng(); + let mut builder = Decimal32Builder::with_capacity(size); + + for _ in 0..size { + builder.append_value(rng.gen_range::(0..10000000)); + } + Arc::new( + builder + .finish() + .with_precision_and_scale(precision, scale) + .unwrap(), + ) +} + +fn build_decimal64_array(size: usize, precision: u8, scale: i8) -> ArrayRef { + let mut rng = seedable_rng(); + let mut builder = Decimal64Builder::with_capacity(size); + + for _ in 0..size { + builder.append_value(rng.gen_range::(0..1000000000)); + } + Arc::new( + builder + .finish() + .with_precision_and_scale(precision, scale) + .unwrap(), + ) +} + fn build_decimal128_array(size: usize, precision: u8, scale: i8) -> ArrayRef { let mut rng = seedable_rng(); let mut builder = Decimal128Builder::with_capacity(size); @@ -158,6 +188,8 @@ fn add_benchmark(c: &mut Criterion) { let utf8_date_array = build_utf8_date_array(512, true); let utf8_date_time_array = build_utf8_date_time_array(512, true); + let decimal32_array = build_decimal32_array(512, 9, 3); + let decimal64_array = build_decimal64_array(512, 10, 3); let decimal128_array = build_decimal128_array(512, 10, 3); let decimal256_array = build_decimal256_array(512, 50, 3); let string_array = build_string_array(512); @@ -247,6 +279,8 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| cast_array(&utf8_date_time_array, DataType::Date64)) }); +// TODO: decimal32, decimal64 + c.bench_function("cast decimal128 to decimal128 512", |b| { b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(30, 5))) }); diff --git a/arrow/benches/decimal_validate.rs b/arrow/benches/decimal_validate.rs index be812a225ca2..88086f9a8720 100644 --- a/arrow/benches/decimal_validate.rs +++ b/arrow/benches/decimal_validate.rs @@ -18,7 +18,12 @@ #[macro_use] extern crate criterion; -use arrow::array::{Array, Decimal128Array, Decimal128Builder, Decimal256Array, Decimal256Builder}; +use arrow::array::{Array, + Decimal32Array, Decimal32Builder, + Decimal64Array, Decimal64Builder, + Decimal128Array, Decimal128Builder, + Decimal256Array, Decimal256Builder +}; use criterion::Criterion; use rand::Rng; @@ -26,6 +31,14 @@ extern crate arrow; use arrow_buffer::i256; 
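+
+// Each validate_* helper retags an already-built array with a different
+// precision, mirroring the Decimal128/Decimal256 validation benchmarks below.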
+fn validate_decimal32_array(array: Decimal32Array) { + array.with_precision_and_scale(8, 0).unwrap(); +} + +fn validate_decimal64_array(array: Decimal64Array) { + array.with_precision_and_scale(16, 0).unwrap(); +} + fn validate_decimal128_array(array: Decimal128Array) { array.with_precision_and_scale(35, 0).unwrap(); } @@ -34,6 +47,46 @@ fn validate_decimal256_array(array: Decimal256Array) { array.with_precision_and_scale(35, 0).unwrap(); } +fn validate_decimal32_benchmark(c: &mut Criterion) { + let mut rng = rand::thread_rng(); + let size: i32 = 20000; + let mut decimal_builder = Decimal32Builder::with_capacity(size as usize); + for _ in 0..size { + decimal_builder.append_value(rng.gen_range::(0..99999999)); + } + let decimal_array = decimal_builder + .finish() + .with_precision_and_scale(9, 0) + .unwrap(); + let data = decimal_array.into_data(); + c.bench_function("validate_decimal32_array 20000", |b| { + b.iter(|| { + let array = Decimal32Array::from(data.clone()); + validate_decimal32_array(array); + }) + }); +} + +fn validate_decimal64_benchmark(c: &mut Criterion) { + let mut rng = rand::thread_rng(); + let size: i64 = 20000; + let mut decimal_builder = Decimal64Builder::with_capacity(size as usize); + for _ in 0..size { + decimal_builder.append_value(rng.gen_range::(0..999999999999)); + } + let decimal_array = decimal_builder + .finish() + .with_precision_and_scale(18, 0) + .unwrap(); + let data = decimal_array.into_data(); + c.bench_function("validate_decimal64_array 20000", |b| { + b.iter(|| { + let array = Decimal64Array::from(data.clone()); + validate_decimal64_array(array); + }) + }); +} + fn validate_decimal128_benchmark(c: &mut Criterion) { let mut rng = rand::thread_rng(); let size: i128 = 20000; @@ -78,6 +131,8 @@ fn validate_decimal256_benchmark(c: &mut Criterion) { criterion_group!( benches, + validate_decimal32_benchmark, + validate_decimal64_benchmark, validate_decimal128_benchmark, validate_decimal256_benchmark, ); diff --git a/arrow/src/tensor.rs b/arrow/src/tensor.rs index cd135a2f04df..3b65ea7b52f9 100644 --- a/arrow/src/tensor.rs +++ b/arrow/src/tensor.rs @@ -86,6 +86,10 @@ pub type BooleanTensor<'a> = Tensor<'a, BooleanType>; pub type Date32Tensor<'a> = Tensor<'a, Date32Type>; /// [Tensor] of type [Int16Type] pub type Date64Tensor<'a> = Tensor<'a, Date64Type>; +/// [Tensor] of type [Decimal32Type] +pub type Decimal32Tensor<'a> = Tensor<'a, Decimal32Type>; +/// [Tensor] of type [Decimal64Type] +pub type Decimal64Tensor<'a> = Tensor<'a, Decimal64Type>; /// [Tensor] of type [Decimal128Type] pub type Decimal128Tensor<'a> = Tensor<'a, Decimal128Type>; /// [Tensor] of type [Decimal256Type] diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 8a7511be2afe..eaa87353feca 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -33,9 +33,9 @@ use arrow_array::builder::{ }; use arrow_array::{ new_empty_array, new_null_array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, - Decimal128Array, Decimal256Array, Float16Array, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, LargeBinaryArray, Time32MillisecondArray, Time32SecondArray, - Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, + Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, Float16Array, Float32Array, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, Time32MillisecondArray, + Time32SecondArray, 
Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; @@ -45,12 +45,24 @@ use half::f16; use paste::paste; use std::sync::Arc; -// Convert the bytes array to i128. +// Convert the bytes array to i32. // The endian of the input bytes array must be big-endian. -pub(crate) fn from_bytes_to_i128(b: &[u8]) -> i128 { +pub(crate) fn from_bytes_to_i32(b: &[u8]) -> i32 { // The bytes array are from parquet file and must be the big-endian. // The endian is defined by parquet format, and the reference document // https://github.com/apache/parquet-format/blob/54e53e5d7794d383529dd30746378f19a12afd58/src/main/thrift/parquet.thrift#L66 + i32::from_be_bytes(sign_extend_be::<4>(b)) +} + +// Convert the bytes array to i64. +// The endian of the input bytes array must be big-endian. +pub(crate) fn from_bytes_to_i64(b: &[u8]) -> i64 { + i64::from_be_bytes(sign_extend_be::<8>(b)) +} + +// Convert the bytes array to i128. +// The endian of the input bytes array must be big-endian. +pub(crate) fn from_bytes_to_i128(b: &[u8]) -> i128 { i128::from_be_bytes(sign_extend_be::<16>(b)) } @@ -263,7 +275,7 @@ macro_rules! make_decimal_stats_iterator { s.$func().map(|x| $stat_value_type::from(*x)) } ParquetStatistics::Int64(s) => { - s.$func().map(|x| $stat_value_type::from(*x)) + s.$func().map(|x| $stat_value_type::try_from(*x).ok()).flatten() } ParquetStatistics::ByteArray(s) => s.$bytes_func().map($convert_func), ParquetStatistics::FixedLenByteArray(s) => { @@ -281,6 +293,34 @@ macro_rules! make_decimal_stats_iterator { }; } +make_decimal_stats_iterator!( + MinDecimal32StatsIterator, + min_opt, + min_bytes_opt, + i32, + from_bytes_to_i32 +); +make_decimal_stats_iterator!( + MaxDecimal32StatsIterator, + max_opt, + max_bytes_opt, + i32, + from_bytes_to_i32 +); +make_decimal_stats_iterator!( + MinDecimal64StatsIterator, + min_opt, + min_bytes_opt, + i64, + from_bytes_to_i64 +); +make_decimal_stats_iterator!( + MaxDecimal64StatsIterator, + max_opt, + max_bytes_opt, + i64, + from_bytes_to_i64 +); make_decimal_stats_iterator!( MinDecimal128StatsIterator, min_opt, @@ -474,6 +514,18 @@ macro_rules! get_statistics { } Ok(Arc::new(builder.finish())) }, + DataType::Decimal32(precision, scale) => { + let arr = Decimal32Array::from_iter( + [<$stat_type_prefix Decimal32StatsIterator>]::new($iterator) + ).with_precision_and_scale(*precision, *scale)?; + Ok(Arc::new(arr)) + }, + DataType::Decimal64(precision, scale) => { + let arr = Decimal64Array::from_iter( + [<$stat_type_prefix Decimal64StatsIterator>]::new($iterator) + ).with_precision_and_scale(*precision, *scale)?; + Ok(Arc::new(arr)) + }, DataType::Decimal128(precision, scale) => { let arr = Decimal128Array::from_iter( [<$stat_type_prefix Decimal128StatsIterator>]::new($iterator) @@ -727,7 +779,7 @@ macro_rules! get_decimal_page_stats_iterator { native_index .indexes .iter() - .map(|x| x.$func.and_then(|x| Some($stat_value_type::from(x)))) + .map(|x| x.$func.and_then(|x| $stat_value_type::try_from(x).ok())) .collect::>(), ), Index::BYTE_ARRAY(native_index) => Some( @@ -761,6 +813,34 @@ macro_rules! 
get_decimal_page_stats_iterator { }; } +get_decimal_page_stats_iterator!( + MinDecimal32DataPageStatsIterator, + min, + i32, + from_bytes_to_i32 +); + +get_decimal_page_stats_iterator!( + MaxDecimal32DataPageStatsIterator, + max, + i32, + from_bytes_to_i32 +); + +get_decimal_page_stats_iterator!( + MinDecimal64DataPageStatsIterator, + min, + i64, + from_bytes_to_i64 +); + +get_decimal_page_stats_iterator!( + MaxDecimal64DataPageStatsIterator, + max, + i64, + from_bytes_to_i64 +); + get_decimal_page_stats_iterator!( MinDecimal128DataPageStatsIterator, min, @@ -954,6 +1034,10 @@ macro_rules! get_data_page_statistics { ) ) ), + DataType::Decimal32(precision, scale) => Ok(Arc::new( + Decimal32Array::from_iter([<$stat_type_prefix Decimal32DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision, *scale)?)), + DataType::Decimal64(precision, scale) => Ok(Arc::new( + Decimal64Array::from_iter([<$stat_type_prefix Decimal64DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision, *scale)?)), DataType::Decimal128(precision, scale) => Ok(Arc::new( Decimal128Array::from_iter([<$stat_type_prefix Decimal128DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision, *scale)?)), DataType::Decimal256(precision, scale) => Ok(Arc::new( diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 3e828bbddd17..d5ff259044d0 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -87,6 +87,8 @@ fn is_leaf(data_type: &DataType) -> bool { | DataType::Binary | DataType::LargeBinary | DataType::BinaryView + | DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) | DataType::FixedSizeBinary(_) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 3ec7a3dfea36..47da27decc16 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -827,6 +827,19 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result(); write_primitive(typed, array, levels) } + ArrowDataType::Decimal32(_, _) => { + let array = column + .as_primitive::() + .unary::<_, Int32Type>(|v| v as i32); + write_primitive(typed, array.values(), levels) + } + ArrowDataType::Decimal64(_, _) => { + // use the int32 to represent the decimal with low precision + let array = column + .as_primitive::() + .unary::<_, Int32Type>(|v| v as i32); + write_primitive(typed, array.values(), levels) + } ArrowDataType::Decimal128(_, _) => { // use the int32 to represent the decimal with low precision let array = column @@ -869,6 +882,12 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result(); write_primitive(typed, array, levels) } + ArrowDataType::Decimal64(_, _) => { + let array = column + .as_primitive::() + .unary::<_, Int64Type>(|v| v as i64); + write_primitive(typed, array.values(), levels) + } ArrowDataType::Decimal128(_, _) => { // use the int64 to represent the decimal with low precision let array = column @@ -936,6 +955,14 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result { + let array = column.as_primitive::(); + get_decimal_32_array_slice(array, indices) + } + ArrowDataType::Decimal64(_, _) => { + let array = column.as_primitive::(); + get_decimal_64_array_slice(array, indices) + } ArrowDataType::Decimal128(_, _) => { let array = column.as_primitive::(); 
get_decimal_128_array_slice(array, indices) @@ -1019,6 +1046,34 @@ fn get_interval_dt_array_slice( values } +fn get_decimal_32_array_slice( + array: &arrow_array::Decimal32Array, + indices: &[usize], +) -> Vec { + let mut values = Vec::with_capacity(indices.len()); + let size = decimal_length_from_precision(array.precision()); + for i in indices { + let as_be_bytes = array.value(*i).to_be_bytes(); + let resized_value = as_be_bytes[(4 - size)..].to_vec(); + values.push(FixedLenByteArray::from(ByteArray::from(resized_value))); + } + values +} + +fn get_decimal_64_array_slice( + array: &arrow_array::Decimal64Array, + indices: &[usize], +) -> Vec { + let mut values = Vec::with_capacity(indices.len()); + let size = decimal_length_from_precision(array.precision()); + for i in indices { + let as_be_bytes = array.value(*i).to_be_bytes(); + let resized_value = as_be_bytes[(8 - size)..].to_vec(); + values.push(FixedLenByteArray::from(ByteArray::from(resized_value))); + } + values +} + fn get_decimal_128_array_slice( array: &arrow_array::Decimal128Array, indices: &[usize], diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 3ed3bd24e0a8..6e6400b4b8a2 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -477,6 +477,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .with_id(id) .build(), + DataType::Decimal32(precision, scale) | DataType::Decimal64(precision, scale) | DataType::Decimal128(precision, scale) | DataType::Decimal256(precision, scale) => { // Decimal precision determines the Parquet physical type to use. // Following the: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal From b653b9b48a117b24e85c4932619c144a43756ccd Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Thu, 31 Oct 2024 07:47:39 -0700 Subject: [PATCH 02/68] small fixes --- arrow-data/src/decimal.rs | 2 +- arrow-json/src/reader/mod.rs | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arrow-data/src/decimal.rs b/arrow-data/src/decimal.rs index 9c629cb49ff8..e0979188db0d 100644 --- a/arrow-data/src/decimal.rs +++ b/arrow-data/src/decimal.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Defines maximum and minimum values for `decimal256` and `decimal128` types for varying precisions. +//! Defines maximum and minimum values for `decimal256`, `decimal128`, `decimal64` and `decimal32` types for varying precisions. //! //! Also provides functions to validate if a given decimal value is within the valid range of the decimal type. 
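The validation functions mentioned in the doc comment above reduce to a digit-count range check. As a minimal self-contained sketch, assuming the usual rule that a value fits precision p when |v| <= 10^p - 1 (the function name below is illustrative, not the crate's API):

fn is_valid_decimal32(v: i32, precision: u8) -> bool {
    // Decimal32 holds at most 9 decimal digits; 10^9 - 1 still fits in an i32.
    assert!((1..=9).contains(&precision));
    let max = 10_i32.pow(precision as u32) - 1;
    (-max..=max).contains(&v)
}

fn main() {
    assert!(is_valid_decimal32(99_999_999, 8));   // 8 digits fit precision 8
    assert!(!is_valid_decimal32(100_000_000, 8)); // 9 digits overflow precision 8
}

The same shape of check applies to Decimal64 with a 10^18 - 1 bound, which is why the benchmarks above validate against precisions 8 and 16 respectively.
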
diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index bcacf6f706b8..b38186b7a040 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -691,6 +691,8 @@ fn make_decoder( DataType::Time32(TimeUnit::Millisecond) => primitive_decoder!(Time32MillisecondType, data_type), DataType::Time64(TimeUnit::Microsecond) => primitive_decoder!(Time64MicrosecondType, data_type), DataType::Time64(TimeUnit::Nanosecond) => primitive_decoder!(Time64NanosecondType, data_type), + DataType::Decimal32(p, s) => Ok(Box::new(DecimalArrayDecoder::::new(p, s))), + DataType::Decimal64(p, s) => Ok(Box::new(DecimalArrayDecoder::::new(p, s))), DataType::Decimal128(p, s) => Ok(Box::new(DecimalArrayDecoder::::new(p, s))), DataType::Decimal256(p, s) => Ok(Box::new(DecimalArrayDecoder::::new(p, s))), DataType::Boolean => Ok(Box::::default()), @@ -1156,6 +1158,8 @@ mod tests { #[test] fn test_decimals() { + test_decimal::(DataType::Decimal32(8, 2)); + test_decimal::(DataType::Decimal64(10, 2)); test_decimal::(DataType::Decimal128(10, 2)); test_decimal::(DataType::Decimal256(10, 2)); } From 490bfc8a1ff993f4884bfc7d5a34592401805295 Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Tue, 17 Dec 2024 18:44:06 -0800 Subject: [PATCH 03/68] more support --- arrow-csv/src/reader/mod.rs | 16 ++++++++++ arrow/benches/cast_kernels.rs | 5 +++- arrow/tests/array_cast.rs | 22 ++++++++++++-- .../array_reader/fixed_len_byte_array.rs | 30 +++++++++++++++++-- .../src/arrow/array_reader/primitive_array.rs | 12 +++++--- 5 files changed, 76 insertions(+), 9 deletions(-) diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index d3d518316397..3a99e651bf3b 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -652,6 +652,22 @@ fn parse( let field = &fields[i]; match field.data_type() { DataType::Boolean => build_boolean_array(line_number, rows, i, null_regex), + DataType::Decimal32(precision, scale) => build_decimal_array::( + line_number, + rows, + i, + *precision, + *scale, + null_regex, + ), + DataType::Decimal64(precision, scale) => build_decimal_array::( + line_number, + rows, + i, + *precision, + *scale, + null_regex, + ), DataType::Decimal128(precision, scale) => build_decimal_array::( line_number, rows, diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index 132bfb07d338..da929ae1da74 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -279,7 +279,10 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| cast_array(&utf8_date_time_array, DataType::Date64)) }); -// TODO: decimal32, decimal64 + // TODO: decimal32, decimal64 + c.bench_function("cast decimal32 to decimal32 512", |b| { + b.iter(|| cast_array(&decimal32_array, DataType::Decimal32(8, 2))) + }); c.bench_function("cast decimal128 to decimal128 512", |b| { b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(30, 5))) diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index ef5ca6041700..bf9962b69f7b 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -18,8 +18,9 @@ use arrow_array::builder::{PrimitiveDictionaryBuilder, StringDictionaryBuilder, UnionBuilder}; use arrow_array::cast::AsArray; use arrow_array::types::{ - ArrowDictionaryKeyType, Decimal128Type, Decimal256Type, Int16Type, Int32Type, Int64Type, - Int8Type, TimestampMicrosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + ArrowDictionaryKeyType, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, + Int16Type, Int32Type, 
Int64Type, Int8Type, TimestampMicrosecondType, + UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use arrow_array::{ Array, ArrayRef, ArrowPrimitiveType, BinaryArray, BooleanArray, Date32Array, Date64Array, @@ -262,6 +263,22 @@ fn get_arrays_of_all_types() -> Vec { Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])), Arc::new(DurationNanosecondArray::from(vec![1000, 2000])), Arc::new(create_decimal_array(vec![Some(1), Some(2), Some(3)], 38, 0).unwrap()), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), + make_dictionary_primitive::(vec![1, 2]), make_dictionary_primitive::(vec![1, 2]), make_dictionary_primitive::(vec![1, 2]), make_dictionary_primitive::(vec![1, 2]), @@ -501,6 +518,7 @@ fn get_all_types() -> Vec { Dictionary(Box::new(key_type.clone()), Box::new(LargeUtf8)), Dictionary(Box::new(key_type.clone()), Box::new(Binary)), Dictionary(Box::new(key_type.clone()), Box::new(LargeBinary)), + Dictionary(Box::new(key_type.clone()), Box::new(Decimal32(9, 0))), Dictionary(Box::new(key_type.clone()), Box::new(Decimal128(38, 0))), Dictionary(Box::new(key_type), Box::new(Decimal256(76, 0))), ] diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index 6b437be943d4..6378cd991e2a 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -27,8 +27,8 @@ use crate::column::reader::decoder::ColumnValueDecoder; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use arrow_array::{ - ArrayRef, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, - IntervalDayTimeArray, IntervalYearMonthArray, + ArrayRef, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, + FixedSizeBinaryArray, Float16Array, IntervalDayTimeArray, IntervalYearMonthArray, }; use arrow_buffer::{i256, Buffer, IntervalDayTime}; use arrow_data::ArrayDataBuilder; @@ -64,6 +64,22 @@ pub fn make_fixed_len_byte_array_reader( }; match &data_type { ArrowType::FixedSizeBinary(_) => {} + ArrowType::Decimal32(_, _) => { + if byte_length > 4 { + return Err(general_err!( + "decimal 64 type too large, must be less then 4 bytes, got {}", + byte_length + )); + } + } + ArrowType::Decimal64(_, _) => { + if byte_length > 8 { + return Err(general_err!( + "decimal 32 type too large, must be less then 8 bytes, got {}", + byte_length + )); + } + } ArrowType::Decimal128(_, _) => { if byte_length > 16 { return Err(general_err!( @@ -168,6 +184,16 @@ impl ArrayReader for FixedLenByteArrayReader { // conversion lambdas are all infallible. This improves performance by avoiding a branch in // the inner loop (see docs for `PrimitiveArray::from_unary`). let array: ArrayRef = match &self.data_type { + ArrowType::Decimal32(p, s) => { + let f = |b: &[u8]| i32::from_be_bytes(sign_extend_be(b)); + Arc::new(Decimal32Array::from_unary(&binary, f).with_precision_and_scale(*p, *s)?) 
+ as ArrayRef + } + ArrowType::Decimal64(p, s) => { + let f = |b: &[u8]| i64::from_be_bytes(sign_extend_be(b)); + Arc::new(Decimal64Array::from_unary(&binary, f).with_precision_and_scale(*p, *s)?) + as ArrayRef + } ArrowType::Decimal128(p, s) => { let f = |b: &[u8]| i128::from_be_bytes(sign_extend_be(b)); Arc::new(Decimal128Array::from_unary(&binary, f).with_precision_and_scale(*p, *s)?) diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index a952e00e12ef..375db933b511 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -23,9 +23,9 @@ use crate::column::page::PageIterator; use crate::data_type::{DataType, Int96}; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -use arrow_array::Decimal256Array; use arrow_array::{ - builder::TimestampNanosecondBufferBuilder, ArrayRef, BooleanArray, Decimal128Array, + builder::TimestampNanosecondBufferBuilder, ArrayRef, BooleanArray, + Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int32Array, Int64Array, TimestampNanosecondArray, UInt32Array, UInt64Array, }; @@ -144,7 +144,8 @@ where // follow C++ implementation and use overflow/reinterpret cast from i32 to u32 which will map // `i32::MIN..0` to `(i32::MAX as u32)..u32::MAX` ArrowType::UInt32 - } + }, + ArrowType::Decimal32(_, _) => target_type.clone(), _ => ArrowType::Int32, } } @@ -154,7 +155,8 @@ where // follow C++ implementation and use overflow/reinterpret cast from i64 to u64 which will map // `i64::MIN..0` to `(i64::MAX as u64)..u64::MAX` ArrowType::UInt64 - } + }, + ArrowType::Decimal64(_, _) => target_type.clone(), _ => ArrowType::Int64, } } @@ -185,11 +187,13 @@ where PhysicalType::INT32 => match array_data.data_type() { ArrowType::UInt32 => Arc::new(UInt32Array::from(array_data)), ArrowType::Int32 => Arc::new(Int32Array::from(array_data)), + ArrowType::Decimal32(_, _) => Arc::new(Decimal32Array::from(array_data)), _ => unreachable!(), }, PhysicalType::INT64 => match array_data.data_type() { ArrowType::UInt64 => Arc::new(UInt64Array::from(array_data)), ArrowType::Int64 => Arc::new(Int64Array::from(array_data)), + ArrowType::Decimal64(_, _) => Arc::new(Decimal64Array::from(array_data)), _ => unreachable!(), }, PhysicalType::FLOAT => Arc::new(Float32Array::from(array_data)), From 3e860ece306e25579567ed13d9c200bd59ad3df5 Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Fri, 27 Dec 2024 19:15:34 -0800 Subject: [PATCH 04/68] More support for decimal32 and decimal64 --- arrow-array/src/cast.rs | 6 + arrow-array/src/record_batch.rs | 4 +- arrow-cast/src/cast/decimal.rs | 145 ++- arrow-cast/src/cast/dictionary.rs | 98 +- arrow-cast/src/cast/mod.rs | 932 ++++++++++++------ arrow-csv/src/reader/mod.rs | 48 + arrow-csv/src/writer.rs | 49 +- arrow-data/src/data.rs | 8 +- arrow-json/src/writer/encoder.rs | 2 +- arrow/benches/cast_kernels.rs | 3 + .../array_reader/fixed_len_byte_array.rs | 4 +- 11 files changed, 864 insertions(+), 435 deletions(-) diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index fc657f94c6a6..a06ca34a02e7 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -301,6 +301,12 @@ macro_rules! 
downcast_primitive { $crate::repeat_pat!($crate::cast::__private::DataType::Float64, $($data_type),+) => { $m!($crate::types::Float64Type $(, $args)*) } + $crate::repeat_pat!($crate::cast::__private::DataType::Decimal32(_, _), $($data_type),+) => { + $m!($crate::types::Decimal32Type $(, $args)*) + } + $crate::repeat_pat!($crate::cast::__private::DataType::Decimal64(_, _), $($data_type),+) => { + $m!($crate::types::Decimal64Type $(, $args)*) + } $crate::repeat_pat!($crate::cast::__private::DataType::Decimal128(_, _), $($data_type),+) => { $m!($crate::types::Decimal128Type $(, $args)*) } diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 8958ca6fae62..956b9dd2fcd4 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -64,7 +64,7 @@ pub trait RecordBatchWriter { /// Support for limited data types is available. The macro will return a compile error if an unsupported data type is used. /// Presently supported data types are: /// - `Boolean`, `Null` -/// - `Decimal128`, `Decimal256` +/// - `Decimal32`, `Decimal64`, `Decimal128`, `Decimal256` /// - `Float16`, `Float32`, `Float64` /// - `Int8`, `Int16`, `Int32`, `Int64` /// - `UInt8`, `UInt16`, `UInt32`, `UInt64` @@ -106,6 +106,8 @@ macro_rules! create_array { (@from DurationMillisecond) => { $crate::DurationMillisecondArray }; (@from DurationMicrosecond) => { $crate::DurationMicrosecondArray }; (@from DurationNanosecond) => { $crate::DurationNanosecondArray }; + (@from Decimal32) => { $crate::Decimal32Array }; + (@from Decimal64) => { $crate::Decimal64Array }; (@from Decimal128) => { $crate::Decimal128Array }; (@from Decimal256) => { $crate::Decimal256Array }; (@from TimestampSecond) => { $crate::TimestampSecondArray }; diff --git a/arrow-cast/src/cast/decimal.rs b/arrow-cast/src/cast/decimal.rs index ba82ca9040c7..e9216385a818 100644 --- a/arrow-cast/src/cast/decimal.rs +++ b/arrow-cast/src/cast/decimal.rs @@ -20,14 +20,80 @@ use crate::cast::*; /// A utility trait that provides checked conversions between /// decimal types inspired by [`NumCast`] pub(crate) trait DecimalCast: Sized { + fn to_i32(self) -> Option; + + fn to_i64(self) -> Option; + fn to_i128(self) -> Option; fn to_i256(self) -> Option; fn from_decimal(n: T) -> Option; + + fn from_f64(n: f64) -> Option; +} + +impl DecimalCast for i32 { + fn to_i32(self) -> Option { + Some(self) + } + + fn to_i64(self) -> Option { + Some(self as i64) + } + + fn to_i128(self) -> Option { + Some(self as i128) + } + + fn to_i256(self) -> Option { + Some(i256::from_i128(self as i128)) + } + + fn from_decimal(n: T) -> Option { + n.to_i32() + } + + fn from_f64(n: f64) -> Option { + n.to_i32() + } +} + +impl DecimalCast for i64 { + fn to_i32(self) -> Option { + Some(self as i32) + } + + fn to_i64(self) -> Option { + Some(self) + } + + fn to_i128(self) -> Option { + Some(self as i128) + } + + fn to_i256(self) -> Option { + Some(i256::from_i128(self as i128)) + } + + fn from_decimal(n: T) -> Option { + n.to_i64() + } + + fn from_f64(n: f64) -> Option { + n.to_i64() + } } impl DecimalCast for i128 { + fn to_i32(self) -> Option { + Some(self as i32) + } + + fn to_i64(self) -> Option { + Some(self as i64) + } + fn to_i128(self) -> Option { Some(self) } @@ -39,9 +105,21 @@ impl DecimalCast for i128 { fn from_decimal(n: T) -> Option { n.to_i128() } + + fn from_f64(n: f64) -> Option { + n.to_i128() + } } impl DecimalCast for i256 { + fn to_i32(self) -> Option { + self.to_i128().map(|x| x as i32) + } + + fn to_i64(self) -> Option { + 
self.to_i128().map(|x| x as i64) + } + fn to_i128(self) -> Option { self.to_i128() } @@ -53,6 +131,10 @@ impl DecimalCast for i256 { fn from_decimal(n: T) -> Option { n.to_i256() } + + fn from_f64(n: f64) -> Option { + i256::from_f64(n) + } } pub(crate) fn cast_decimal_to_decimal_error( @@ -464,52 +546,7 @@ where Ok(Arc::new(result)) } -pub(crate) fn cast_floating_point_to_decimal128( - array: &PrimitiveArray, - precision: u8, - scale: i8, - cast_options: &CastOptions, -) -> Result -where - ::Native: AsPrimitive, -{ - let mul = 10_f64.powi(scale as i32); - - if cast_options.safe { - array - .unary_opt::<_, Decimal128Type>(|v| { - (mul * v.as_()) - .round() - .to_i128() - .filter(|v| Decimal128Type::is_valid_decimal_precision(*v, precision)) - }) - .with_precision_and_scale(precision, scale) - .map(|a| Arc::new(a) as ArrayRef) - } else { - array - .try_unary::<_, Decimal128Type, _>(|v| { - (mul * v.as_()) - .round() - .to_i128() - .ok_or_else(|| { - ArrowError::CastError(format!( - "Cannot cast to {}({}, {}). Overflowing on {:?}", - Decimal128Type::PREFIX, - precision, - scale, - v - )) - }) - .and_then(|v| { - Decimal128Type::validate_decimal_precision(v, precision).map(|_| v) - }) - })? - .with_precision_and_scale(precision, scale) - .map(|a| Arc::new(a) as ArrayRef) - } -} - -pub(crate) fn cast_floating_point_to_decimal256( +pub(crate) fn cast_floating_point_to_decimal( array: &PrimitiveArray, precision: u8, scale: i8, @@ -517,32 +554,34 @@ pub(crate) fn cast_floating_point_to_decimal256( ) -> Result where ::Native: AsPrimitive, + D: DecimalType + ArrowPrimitiveType, + M: ArrowNativeTypeOp + DecimalCast, { let mul = 10_f64.powi(scale as i32); if cast_options.safe { array - .unary_opt::<_, Decimal256Type>(|v| { - i256::from_f64((v.as_() * mul).round()) - .filter(|v| Decimal256Type::is_valid_decimal_precision(*v, precision)) + .unary_opt::<_, D>(|v| { + M::from_f64::((mul * v.as_()).round()) + .filter(|v| D::is_valid_decimal_precision(*v, precision)) }) .with_precision_and_scale(precision, scale) .map(|a| Arc::new(a) as ArrayRef) } else { array - .try_unary::<_, Decimal256Type, _>(|v| { - i256::from_f64((v.as_() * mul).round()) + .try_unary::<_, D, _>(|v| { + M::from_f64::((mul * v.as_()).round()) .ok_or_else(|| { ArrowError::CastError(format!( "Cannot cast to {}({}, {}). Overflowing on {:?}", - Decimal256Type::PREFIX, + D::PREFIX, precision, scale, v )) }) .and_then(|v| { - Decimal256Type::validate_decimal_precision(v, precision).map(|_| v) + D::validate_decimal_precision(v, precision).map(|_| v) }) })? 
.with_precision_and_scale(precision, scale) diff --git a/arrow-cast/src/cast/dictionary.rs b/arrow-cast/src/cast/dictionary.rs index ec0ab346f997..4ea514375b61 100644 --- a/arrow-cast/src/cast/dictionary.rs +++ b/arrow-cast/src/cast/dictionary.rs @@ -214,49 +214,37 @@ pub(crate) fn cast_to_dictionary( UInt16 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), UInt32 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), UInt64 => pack_numeric_to_dictionary::(array, dict_value_type, cast_options), + Decimal32(p, s) => { + pack_decimal_to_dictionary::( + array, + p, + s, + cast_options + ) + } + Decimal64(p, s) => { + pack_decimal_to_dictionary::( + array, + p, + s, + cast_options + ) + } Decimal128(p, s) => { - let dict = pack_numeric_to_dictionary::( + pack_decimal_to_dictionary::( array, - dict_value_type, - cast_options, - )?; - let dict = dict - .as_dictionary::() - .downcast_dict::() - .ok_or_else(|| { - ArrowError::ComputeError( - "Internal Error: Cannot cast dict to Decimal128Array".to_string(), - ) - })?; - let value = dict.values().clone(); - // Set correct precision/scale - let value = value.with_precision_and_scale(p, s)?; - Ok(Arc::new(DictionaryArray::::try_new( - dict.keys().clone(), - Arc::new(value), - )?)) + p, + s, + cast_options + ) } Decimal256(p, s) => { - let dict = pack_numeric_to_dictionary::( + pack_decimal_to_dictionary::( array, - dict_value_type, - cast_options, - )?; - let dict = dict - .as_dictionary::() - .downcast_dict::() - .ok_or_else(|| { - ArrowError::ComputeError( - "Internal Error: Cannot cast dict to Decimal256Array".to_string(), - ) - })?; - let value = dict.values().clone(); - // Set correct precision/scale - let value = value.with_precision_and_scale(p, s)?; - Ok(Arc::new(DictionaryArray::::try_new( - dict.keys().clone(), - Arc::new(value), - )?)) + p, + s, + cast_options + ) } Float16 => { pack_numeric_to_dictionary::(array, dict_value_type, cast_options) @@ -359,6 +347,40 @@ where Ok(Arc::new(b.finish())) } +pub(crate) fn pack_decimal_to_dictionary( + array: &dyn Array, + precision: u8, + scale: i8, + cast_options: &CastOptions, +) -> Result +where + K: ArrowDictionaryKeyType, + D: DecimalType + ArrowPrimitiveType, + M: ArrowNativeTypeOp + DecimalCast, +{ + let dict = pack_numeric_to_dictionary::( + array, + &D::DATA_TYPE, + cast_options, + )?; + let dict = dict + .as_dictionary::() + .downcast_dict::>() + .ok_or_else(|| { + ArrowError::ComputeError(format!( + "Internal Error: Cannot cast dict to {}", + D::PREFIX + )) + })?; + let value = dict.values().clone(); + // Set correct precision/scale + let value = value.with_precision_and_scale(precision, scale)?; + Ok(Arc::new(DictionaryArray::::try_new( + dict.keys().clone(), + Arc::new(value), + )?)) +} + pub(crate) fn string_view_to_dictionary( array: &dyn Array, ) -> Result diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 7c14126f36b6..391ffce90cbe 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -830,6 +830,71 @@ pub fn cast_with_options( (Map(_, ordered1), Map(_, ordered2)) if ordered1 == ordered2 => { cast_map_values(array.as_map(), to_type, cast_options, ordered1.to_owned()) } + (Decimal32(p1, s1), Decimal32(p2, s2)) => { + cast_decimal_to_decimal_same_type::( + array.as_primitive(), + *p1, + *s1, + *p2, + *s2, + cast_options, + ) + } + (Decimal32(_, s1), Decimal64(p2, s2)) => { + cast_decimal_to_decimal::( + array.as_primitive(), + *s1, + *p2, + *s2, + cast_options, + ) + } + (Decimal32(_, s1), 
Decimal128(p2, s2)) => { + cast_decimal_to_decimal::( + array.as_primitive(), + *s1, + *p2, + *s2, + cast_options, + ) + } + (Decimal32(_, s1), Decimal256(p2, s2)) => { + cast_decimal_to_decimal::( + array.as_primitive(), + *s1, + *p2, + *s2, + cast_options, + ) + } + (Decimal64(p1, s1), Decimal64(p2, s2)) => { + cast_decimal_to_decimal_same_type::( + array.as_primitive(), + *p1, + *s1, + *p2, + *s2, + cast_options, + ) + } + (Decimal64(_, s1), Decimal128(p2, s2)) => { + cast_decimal_to_decimal::( + array.as_primitive(), + *s1, + *p2, + *s2, + cast_options, + ) + } + (Decimal64(_, s1), Decimal256(p2, s2)) => { + cast_decimal_to_decimal::( + array.as_primitive(), + *s1, + *p2, + *s2, + cast_options, + ) + } (Decimal128(p1, s1), Decimal128(p2, s2)) => { cast_decimal_to_decimal_same_type::( array.as_primitive(), @@ -868,315 +933,93 @@ pub fn cast_with_options( cast_options, ) } + (Decimal32(_, scale), _) if !to_type.is_temporal() => { + cast_from_decimal::( + array, + 10_i32, + scale, + from_type, + to_type, + |x: i32| x as f64, + cast_options, + ) + } + (Decimal64(_, scale), _) if !to_type.is_temporal() => { + cast_from_decimal::( + array, + 10_i64, + scale, + from_type, + to_type, + |x: i64| x as f64, + cast_options, + ) + } (Decimal128(_, scale), _) if !to_type.is_temporal() => { - // cast decimal to other type - match to_type { - UInt8 => cast_decimal_to_integer::( - array, - 10_i128, - *scale, - cast_options, - ), - UInt16 => cast_decimal_to_integer::( - array, - 10_i128, - *scale, - cast_options, - ), - UInt32 => cast_decimal_to_integer::( - array, - 10_i128, - *scale, - cast_options, - ), - UInt64 => cast_decimal_to_integer::( - array, - 10_i128, - *scale, - cast_options, - ), - Int8 => cast_decimal_to_integer::( - array, - 10_i128, - *scale, - cast_options, - ), - Int16 => cast_decimal_to_integer::( - array, - 10_i128, - *scale, - cast_options, - ), - Int32 => cast_decimal_to_integer::( - array, - 10_i128, - *scale, - cast_options, - ), - Int64 => cast_decimal_to_integer::( - array, - 10_i128, - *scale, - cast_options, - ), - Float32 => cast_decimal_to_float::(array, |x| { - (x as f64 / 10_f64.powi(*scale as i32)) as f32 - }), - Float64 => cast_decimal_to_float::(array, |x| { - x as f64 / 10_f64.powi(*scale as i32) - }), - Utf8View => value_to_string_view(array, cast_options), - Utf8 => value_to_string::(array, cast_options), - LargeUtf8 => value_to_string::(array, cast_options), - Null => Ok(new_null_array(to_type, array.len())), - _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported" - ))), - } + cast_from_decimal::( + array, + 10_i128, + scale, + from_type, + to_type, + |x: i128| x as f64, + cast_options, + ) } (Decimal256(_, scale), _) if !to_type.is_temporal() => { - // cast decimal to other type - match to_type { - UInt8 => cast_decimal_to_integer::( - array, - i256::from_i128(10_i128), - *scale, - cast_options, - ), - UInt16 => cast_decimal_to_integer::( - array, - i256::from_i128(10_i128), - *scale, - cast_options, - ), - UInt32 => cast_decimal_to_integer::( - array, - i256::from_i128(10_i128), - *scale, - cast_options, - ), - UInt64 => cast_decimal_to_integer::( - array, - i256::from_i128(10_i128), - *scale, - cast_options, - ), - Int8 => cast_decimal_to_integer::( - array, - i256::from_i128(10_i128), - *scale, - cast_options, - ), - Int16 => cast_decimal_to_integer::( - array, - i256::from_i128(10_i128), - *scale, - cast_options, - ), - Int32 => cast_decimal_to_integer::( - array, - i256::from_i128(10_i128), - *scale, - 
cast_options, - ), - Int64 => cast_decimal_to_integer::( - array, - i256::from_i128(10_i128), - *scale, - cast_options, - ), - Float32 => cast_decimal_to_float::(array, |x| { - (x.to_f64().unwrap() / 10_f64.powi(*scale as i32)) as f32 - }), - Float64 => cast_decimal_to_float::(array, |x| { - x.to_f64().unwrap() / 10_f64.powi(*scale as i32) - }), - Utf8View => value_to_string_view(array, cast_options), - Utf8 => value_to_string::(array, cast_options), - LargeUtf8 => value_to_string::(array, cast_options), - Null => Ok(new_null_array(to_type, array.len())), - _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported" - ))), - } + cast_from_decimal::( + array, + i256::from_i128(10_i128), + scale, + from_type, + to_type, + |x: i256| x.to_f64().unwrap(), + cast_options, + ) + } + (_, Decimal32(precision, scale)) if !from_type.is_temporal() => { + cast_to_decimal::( + array, + 10_i32, + precision, + scale, + from_type, + to_type, + cast_options, + ) + } + (_, Decimal64(precision, scale)) if !from_type.is_temporal() => { + cast_to_decimal::( + array, + 10_i64, + precision, + scale, + from_type, + to_type, + cast_options, + ) } (_, Decimal128(precision, scale)) if !from_type.is_temporal() => { - // cast data to decimal - match from_type { - UInt8 => cast_integer_to_decimal::<_, Decimal128Type, _>( - array.as_primitive::(), - *precision, - *scale, - 10_i128, - cast_options, - ), - UInt16 => cast_integer_to_decimal::<_, Decimal128Type, _>( - array.as_primitive::(), - *precision, - *scale, - 10_i128, - cast_options, - ), - UInt32 => cast_integer_to_decimal::<_, Decimal128Type, _>( - array.as_primitive::(), - *precision, - *scale, - 10_i128, - cast_options, - ), - UInt64 => cast_integer_to_decimal::<_, Decimal128Type, _>( - array.as_primitive::(), - *precision, - *scale, - 10_i128, - cast_options, - ), - Int8 => cast_integer_to_decimal::<_, Decimal128Type, _>( - array.as_primitive::(), - *precision, - *scale, - 10_i128, - cast_options, - ), - Int16 => cast_integer_to_decimal::<_, Decimal128Type, _>( - array.as_primitive::(), - *precision, - *scale, - 10_i128, - cast_options, - ), - Int32 => cast_integer_to_decimal::<_, Decimal128Type, _>( - array.as_primitive::(), - *precision, - *scale, - 10_i128, - cast_options, - ), - Int64 => cast_integer_to_decimal::<_, Decimal128Type, _>( - array.as_primitive::(), - *precision, - *scale, - 10_i128, - cast_options, - ), - Float32 => cast_floating_point_to_decimal128( - array.as_primitive::(), - *precision, - *scale, - cast_options, - ), - Float64 => cast_floating_point_to_decimal128( - array.as_primitive::(), - *precision, - *scale, - cast_options, - ), - Utf8View | Utf8 => cast_string_to_decimal::( - array, - *precision, - *scale, - cast_options, - ), - LargeUtf8 => cast_string_to_decimal::( - array, - *precision, - *scale, - cast_options, - ), - Null => Ok(new_null_array(to_type, array.len())), - _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported" - ))), - } + cast_to_decimal::( + array, + 10_i128, + precision, + scale, + from_type, + to_type, + cast_options, + ) } (_, Decimal256(precision, scale)) if !from_type.is_temporal() => { - // cast data to decimal - match from_type { - UInt8 => cast_integer_to_decimal::<_, Decimal256Type, _>( - array.as_primitive::(), - *precision, - *scale, - i256::from_i128(10_i128), - cast_options, - ), - UInt16 => cast_integer_to_decimal::<_, Decimal256Type, _>( - array.as_primitive::(), - *precision, - *scale, - i256::from_i128(10_i128), - 
cast_options, - ), - UInt32 => cast_integer_to_decimal::<_, Decimal256Type, _>( - array.as_primitive::(), - *precision, - *scale, - i256::from_i128(10_i128), - cast_options, - ), - UInt64 => cast_integer_to_decimal::<_, Decimal256Type, _>( - array.as_primitive::(), - *precision, - *scale, - i256::from_i128(10_i128), - cast_options, - ), - Int8 => cast_integer_to_decimal::<_, Decimal256Type, _>( - array.as_primitive::(), - *precision, - *scale, - i256::from_i128(10_i128), - cast_options, - ), - Int16 => cast_integer_to_decimal::<_, Decimal256Type, _>( - array.as_primitive::(), - *precision, - *scale, - i256::from_i128(10_i128), - cast_options, - ), - Int32 => cast_integer_to_decimal::<_, Decimal256Type, _>( - array.as_primitive::(), - *precision, - *scale, - i256::from_i128(10_i128), - cast_options, - ), - Int64 => cast_integer_to_decimal::<_, Decimal256Type, _>( - array.as_primitive::(), - *precision, - *scale, - i256::from_i128(10_i128), - cast_options, - ), - Float32 => cast_floating_point_to_decimal256( - array.as_primitive::(), - *precision, - *scale, - cast_options, - ), - Float64 => cast_floating_point_to_decimal256( - array.as_primitive::(), - *precision, - *scale, - cast_options, - ), - Utf8View | Utf8 => cast_string_to_decimal::( - array, - *precision, - *scale, - cast_options, - ), - LargeUtf8 => cast_string_to_decimal::( - array, - *precision, - *scale, - cast_options, - ), - Null => Ok(new_null_array(to_type, array.len())), - _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported" - ))), - } + cast_to_decimal::( + array, + i256::from_i128(10_i128), + precision, + scale, + from_type, + to_type, + cast_options, + ) } (Struct(_), Struct(to_fields)) => { let array = array.as_struct(); @@ -2192,6 +2035,198 @@ pub fn cast_with_options( } } +fn cast_from_decimal( + array: &dyn Array, + base: D::Native, + scale: &i8, + from_type: &DataType, + to_type: &DataType, + as_float: F, + cast_options: &CastOptions, +) -> Result +where + D: DecimalType + ArrowPrimitiveType, + ::Native: ArrowNativeTypeOp + ToPrimitive, + F: Fn(D::Native) -> f64, +{ + use DataType::*; + // cast decimal to other type + match to_type { + UInt8 => cast_decimal_to_integer::( + array, + base, + *scale, + cast_options, + ), + UInt16 => cast_decimal_to_integer::( + array, + base, + *scale, + cast_options, + ), + UInt32 => cast_decimal_to_integer::( + array, + base, + *scale, + cast_options, + ), + UInt64 => cast_decimal_to_integer::( + array, + base, + *scale, + cast_options, + ), + Int8 => cast_decimal_to_integer::( + array, + base, + *scale, + cast_options, + ), + Int16 => cast_decimal_to_integer::( + array, + base, + *scale, + cast_options, + ), + Int32 => cast_decimal_to_integer::( + array, + base, + *scale, + cast_options, + ), + Int64 => cast_decimal_to_integer::( + array, + base, + *scale, + cast_options, + ), + Float32 => cast_decimal_to_float::(array, |x| { + (as_float(x) / 10_f64.powi(*scale as i32)) as f32 + }), + Float64 => cast_decimal_to_float::(array, |x| { + as_float(x) / 10_f64.powi(*scale as i32) + }), + Utf8View => value_to_string_view(array, cast_options), + Utf8 => value_to_string::(array, cast_options), + LargeUtf8 => value_to_string::(array, cast_options), + Null => Ok(new_null_array(to_type, array.len())), + _ => Err(ArrowError::CastError(format!( + "Casting from {from_type:?} to {to_type:?} not supported" + ))), + } +} + +fn cast_to_decimal( + array: &dyn Array, + base: M, + precision: &u8, + scale: &i8, + from_type: &DataType, + to_type: &DataType, + 
cast_options: &CastOptions, +) -> Result +where + D: DecimalType + ArrowPrimitiveType, + M: ArrowNativeTypeOp + DecimalCast, + u8: num::traits::AsPrimitive, + u16: num::traits::AsPrimitive, + u32: num::traits::AsPrimitive, + u64: num::traits::AsPrimitive, + i8: num::traits::AsPrimitive, + i16: num::traits::AsPrimitive, + i32: num::traits::AsPrimitive, + i64: num::traits::AsPrimitive, +{ + use DataType::*; + // cast data to decimal + match from_type { + UInt8 => cast_integer_to_decimal::<_, D, M>( + array.as_primitive::(), + *precision, + *scale, + base, + cast_options, + ), + UInt16 => cast_integer_to_decimal::<_, D, _>( + array.as_primitive::(), + *precision, + *scale, + base, + cast_options, + ), + UInt32 => cast_integer_to_decimal::<_, D, _>( + array.as_primitive::(), + *precision, + *scale, + base, + cast_options, + ), + UInt64 => cast_integer_to_decimal::<_, D, _>( + array.as_primitive::(), + *precision, + *scale, + base, + cast_options, + ), + Int8 => cast_integer_to_decimal::<_, D, _>( + array.as_primitive::(), + *precision, + *scale, + base, + cast_options, + ), + Int16 => cast_integer_to_decimal::<_, D, _>( + array.as_primitive::(), + *precision, + *scale, + base, + cast_options, + ), + Int32 => cast_integer_to_decimal::<_, D, _>( + array.as_primitive::(), + *precision, + *scale, + base, + cast_options, + ), + Int64 => cast_integer_to_decimal::<_, D, _>( + array.as_primitive::(), + *precision, + *scale, + base, + cast_options, + ), + Float32 => cast_floating_point_to_decimal::<_, D, _>( + array.as_primitive::(), + *precision, + *scale, + cast_options, + ), + Float64 => cast_floating_point_to_decimal::<_, D, _>( + array.as_primitive::(), + *precision, + *scale, + cast_options, + ), + Utf8View | Utf8 => cast_string_to_decimal::( + array, + *precision, + *scale, + cast_options, + ), + LargeUtf8 => cast_string_to_decimal::( + array, + *precision, + *scale, + cast_options, + ), + Null => Ok(new_null_array(to_type, array.len())), + _ => Err(ArrowError::CastError(format!( + "Casting from {from_type:?} to {to_type:?} not supported" + ))), + } +} + /// Get the time unit as a multiple of a second const fn time_unit_multiple(unit: &TimeUnit) -> i64 { match unit { @@ -2527,6 +2562,28 @@ mod tests { }; } + fn create_decimal32_array( + array: Vec>, + precision: u8, + scale: i8, + ) -> Result { + array + .into_iter() + .collect::() + .with_precision_and_scale(precision, scale) + } + + fn create_decimal64_array( + array: Vec>, + precision: u8, + scale: i8, + ) -> Result { + array + .into_iter() + .collect::() + .with_precision_and_scale(precision, scale) + } + fn create_decimal_array( array: Vec>, precision: u8, @@ -2675,6 +2732,72 @@ mod tests { ); } + #[test] + fn test_cast_decimal32_to_decimal32() { + let input_type = DataType::Decimal32(9, 3); + let output_type = DataType::Decimal32(9, 4); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; + let array = create_decimal32_array(array, 9, 3).unwrap(); + generate_cast_test_case!( + &array, + Decimal32Array, + &output_type, + vec![ + Some(11234560_i32), + Some(21234560_i32), + Some(31234560_i32), + None + ] + ); + // negative test + let array = vec![Some(123456), None]; + let array = create_decimal32_array(array, 9, 0).unwrap(); + let result_safe = cast(&array, &DataType::Decimal32(2, 2)); + assert!(result_safe.is_ok()); + let options = CastOptions { + safe: false, + ..Default::default() + }; + + let result_unsafe = cast_with_options(&array, &DataType::Decimal32(2, 2), 
&options); + assert_eq!("Invalid argument error: 12345600 is too large to store in a Decimal32 of precision 2. Max is 99", + result_unsafe.unwrap_err().to_string()); + } + + #[test] + fn test_cast_decimal64_to_decimal64() { + let input_type = DataType::Decimal64(17, 3); + let output_type = DataType::Decimal64(17, 4); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; + let array = create_decimal64_array(array, 17, 3).unwrap(); + generate_cast_test_case!( + &array, + Decimal64Array, + &output_type, + vec![ + Some(11234560_i64), + Some(21234560_i64), + Some(31234560_i64), + None + ] + ); + // negative test + let array = vec![Some(123456), None]; + let array = create_decimal64_array(array, 9, 0).unwrap(); + let result_safe = cast(&array, &DataType::Decimal64(2, 2)); + assert!(result_safe.is_ok()); + let options = CastOptions { + safe: false, + ..Default::default() + }; + + let result_unsafe = cast_with_options(&array, &DataType::Decimal64(2, 2), &options); + assert_eq!("Invalid argument error: 12345600 is too large to store in a Decimal64 of precision 2. Max is 99", + result_unsafe.unwrap_err().to_string()); + } + #[test] fn test_cast_decimal128_to_decimal128() { let input_type = DataType::Decimal128(20, 3); @@ -2708,6 +2831,38 @@ mod tests { result_unsafe.unwrap_err().to_string()); } + #[test] + fn test_cast_decimal32_to_decimal32_dict() { + let p = 9; + let s = 3; + let input_type = DataType::Decimal32(p, s); + let output_type = DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Decimal32(p, s)), + ); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; + let array = create_decimal32_array(array, p, s).unwrap(); + let cast_array = cast_with_options(&array, &output_type, &CastOptions::default()).unwrap(); + assert_eq!(cast_array.data_type(), &output_type); + } + + #[test] + fn test_cast_decimal64_to_decimal64_dict() { + let p = 15; + let s = 3; + let input_type = DataType::Decimal64(p, s); + let output_type = DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Decimal64(p, s)), + ); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; + let array = create_decimal64_array(array, p, s).unwrap(); + let cast_array = cast_with_options(&array, &output_type, &CastOptions::default()).unwrap(); + assert_eq!(cast_array.data_type(), &output_type); + } + #[test] fn test_cast_decimal128_to_decimal128_dict() { let p = 20; @@ -2740,6 +2895,46 @@ mod tests { assert_eq!(cast_array.data_type(), &output_type); } + #[test] + fn test_cast_decimal32_to_decimal32_overflow() { + let input_type = DataType::Decimal32(9, 3); + let output_type = DataType::Decimal32(9, 9); + assert!(can_cast_types(&input_type, &output_type)); + + let array = vec![Some(i32::MAX)]; + let array = create_decimal32_array(array, 9, 3).unwrap(); + let result = cast_with_options( + &array, + &output_type, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); + assert_eq!("Cast error: Cannot cast to Decimal32(9, 9). 
Overflowing on 2147483647", + result.unwrap_err().to_string()); + } + + #[test] + fn test_cast_decimal64_to_decimal64_overflow() { + let input_type = DataType::Decimal64(18, 3); + let output_type = DataType::Decimal64(18, 18); + assert!(can_cast_types(&input_type, &output_type)); + + let array = vec![Some(i64::MAX)]; + let array = create_decimal64_array(array, 18, 3).unwrap(); + let result = cast_with_options( + &array, + &output_type, + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ); + assert_eq!("Cast error: Cannot cast to Decimal64(18, 18). Overflowing on 9223372036854775807", + result.unwrap_err().to_string()); + } + #[test] fn test_cast_decimal128_to_decimal128_overflow() { let input_type = DataType::Decimal128(38, 3); @@ -2780,6 +2975,44 @@ mod tests { result.unwrap_err().to_string()); } + #[test] + fn test_cast_decimal32_to_decimal256() { + let input_type = DataType::Decimal32(8, 3); + let output_type = DataType::Decimal256(20, 4); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; + let array = create_decimal32_array(array, 8, 3).unwrap(); + generate_cast_test_case!( + &array, + Decimal256Array, + &output_type, + vec![ + Some(i256::from_i128(11234560_i128)), + Some(i256::from_i128(21234560_i128)), + Some(i256::from_i128(31234560_i128)), + None + ] + ); + } + #[test] + fn test_cast_decimal64_to_decimal256() { + let input_type = DataType::Decimal64(12, 3); + let output_type = DataType::Decimal256(20, 4); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; + let array = create_decimal64_array(array, 12, 3).unwrap(); + generate_cast_test_case!( + &array, + Decimal256Array, + &output_type, + vec![ + Some(i256::from_i128(11234560_i128)), + Some(i256::from_i128(21234560_i128)), + Some(i256::from_i128(31234560_i128)), + None + ] + ); + } #[test] fn test_cast_decimal128_to_decimal256() { let input_type = DataType::Decimal128(20, 3); @@ -2888,69 +3121,67 @@ mod tests { ); } - #[test] - fn test_cast_decimal_to_numeric() { - let value_array: Vec> = vec![Some(125), Some(225), Some(325), None, Some(525)]; - let array = create_decimal_array(value_array, 38, 2).unwrap(); + macro_rules! 
generate_decimal_to_numeric_cast_test_case { + ($INPUT_ARRAY: expr) => { // u8 generate_cast_test_case!( - &array, + $INPUT_ARRAY, UInt8Array, &DataType::UInt8, vec![Some(1_u8), Some(2_u8), Some(3_u8), None, Some(5_u8)] ); // u16 generate_cast_test_case!( - &array, + $INPUT_ARRAY, UInt16Array, &DataType::UInt16, vec![Some(1_u16), Some(2_u16), Some(3_u16), None, Some(5_u16)] ); // u32 generate_cast_test_case!( - &array, + $INPUT_ARRAY, UInt32Array, &DataType::UInt32, vec![Some(1_u32), Some(2_u32), Some(3_u32), None, Some(5_u32)] ); // u64 generate_cast_test_case!( - &array, + $INPUT_ARRAY, UInt64Array, &DataType::UInt64, vec![Some(1_u64), Some(2_u64), Some(3_u64), None, Some(5_u64)] ); // i8 generate_cast_test_case!( - &array, + $INPUT_ARRAY, Int8Array, &DataType::Int8, vec![Some(1_i8), Some(2_i8), Some(3_i8), None, Some(5_i8)] ); // i16 generate_cast_test_case!( - &array, + $INPUT_ARRAY, Int16Array, &DataType::Int16, vec![Some(1_i16), Some(2_i16), Some(3_i16), None, Some(5_i16)] ); // i32 generate_cast_test_case!( - &array, + $INPUT_ARRAY, Int32Array, &DataType::Int32, vec![Some(1_i32), Some(2_i32), Some(3_i32), None, Some(5_i32)] ); // i64 generate_cast_test_case!( - &array, + $INPUT_ARRAY, Int64Array, &DataType::Int64, vec![Some(1_i64), Some(2_i64), Some(3_i64), None, Some(5_i64)] ); // f32 generate_cast_test_case!( - &array, + $INPUT_ARRAY, Float32Array, &DataType::Float32, vec![ @@ -2963,7 +3194,7 @@ mod tests { ); // f64 generate_cast_test_case!( - &array, + $INPUT_ARRAY, Float64Array, &DataType::Float64, vec![ @@ -2974,6 +3205,31 @@ mod tests { Some(5.25_f64) ] ); + } + } + + #[test] + fn test_cast_decimal32_to_numeric() { + let value_array: Vec> = vec![Some(125), Some(225), Some(325), None, Some(525)]; + let array = create_decimal32_array(value_array, 8, 2).unwrap(); + + generate_decimal_to_numeric_cast_test_case!(&array); + } + + #[test] + fn test_cast_decimal64_to_numeric() { + let value_array: Vec> = vec![Some(125), Some(225), Some(325), None, Some(525)]; + let array = create_decimal64_array(value_array, 8, 2).unwrap(); + + generate_decimal_to_numeric_cast_test_case!(&array); + } + + #[test] + fn test_cast_decimal_to_numeric() { + let value_array: Vec> = vec![Some(125), Some(225), Some(325), None, Some(525)]; + let array = create_decimal_array(value_array, 38, 2).unwrap(); + + generate_decimal_to_numeric_cast_test_case!(&array); // overflow test: out of range of max u8 let value_array: Vec> = vec![Some(51300)]; @@ -9226,6 +9482,14 @@ mod tests { #[test] fn test_cast_decimal_to_string() { + assert!(can_cast_types( + &DataType::Decimal32(9, 4), + &DataType::Utf8View + )); + assert!(can_cast_types( + &DataType::Decimal64(16, 4), + &DataType::Utf8View + )); assert!(can_cast_types( &DataType::Decimal128(10, 4), &DataType::Utf8View @@ -9270,7 +9534,7 @@ mod tests { } } - let array128: Vec> = vec![ + let array32: Vec> = vec![ Some(1123454), Some(2123456), Some(-3123453), @@ -9281,11 +9545,45 @@ mod tests { Some(-123456789), None, ]; + let array64: Vec> = array32 + .iter() + .map(|num| num.map(|x| x as i64)) + .collect(); + let array128: Vec> = array64 + .iter() + .map(|num| num.map(|x| x as i128)) + .collect(); let array256: Vec> = array128 .iter() .map(|num| num.map(i256::from_i128)) .collect(); + test_decimal_to_string::( + DataType::Utf8View, + create_decimal32_array(array32.clone(), 7, 3).unwrap(), + ); + test_decimal_to_string::( + DataType::Utf8, + create_decimal32_array(array32.clone(), 7, 3).unwrap(), + ); + test_decimal_to_string::( + DataType::LargeUtf8, + 
create_decimal32_array(array32, 7, 3).unwrap(), + ); + + test_decimal_to_string::( + DataType::Utf8View, + create_decimal64_array(array64.clone(), 7, 3).unwrap(), + ); + test_decimal_to_string::( + DataType::Utf8, + create_decimal64_array(array64.clone(), 7, 3).unwrap(), + ); + test_decimal_to_string::( + DataType::LargeUtf8, + create_decimal64_array(array64, 7, 3).unwrap(), + ); + test_decimal_to_string::( DataType::Utf8View, create_decimal_array(array128.clone(), 7, 3).unwrap(), diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 3a99e651bf3b..a8ce3fcc5b1a 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -1325,6 +1325,54 @@ mod tests { assert_eq!("0.290472", lng.value_as_string(9)); } + #[test] + fn test_csv_reader_with_decimal_3264() { + let schema = Arc::new(Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Decimal32(9, 6), false), + Field::new("lng", DataType::Decimal64(16, 6), false), + ])); + + let file = File::open("test/data/decimal_test.csv").unwrap(); + + let mut csv = ReaderBuilder::new(schema).build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("57.653484", lat.value_as_string(0)); + assert_eq!("53.002666", lat.value_as_string(1)); + assert_eq!("52.412811", lat.value_as_string(2)); + assert_eq!("51.481583", lat.value_as_string(3)); + assert_eq!("12.123456", lat.value_as_string(4)); + assert_eq!("50.760000", lat.value_as_string(5)); + assert_eq!("0.123000", lat.value_as_string(6)); + assert_eq!("123.000000", lat.value_as_string(7)); + assert_eq!("123.000000", lat.value_as_string(8)); + assert_eq!("-50.760000", lat.value_as_string(9)); + + let lng = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("-3.335724", lng.value_as_string(0)); + assert_eq!("-2.179404", lng.value_as_string(1)); + assert_eq!("-1.778197", lng.value_as_string(2)); + assert_eq!("-3.179090", lng.value_as_string(3)); + assert_eq!("-3.179090", lng.value_as_string(4)); + assert_eq!("0.290472", lng.value_as_string(5)); + assert_eq!("0.290472", lng.value_as_string(6)); + assert_eq!("0.290472", lng.value_as_string(7)); + assert_eq!("0.290472", lng.value_as_string(8)); + assert_eq!("0.290472", lng.value_as_string(9)); + } + #[test] fn test_csv_from_buf_reader() { let schema = Schema::new(vec![ diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index c5a0a0b76d59..211a107e2a1e 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -418,8 +418,8 @@ mod tests { use crate::ReaderBuilder; use arrow_array::builder::{ - BinaryBuilder, Decimal128Builder, Decimal256Builder, FixedSizeBinaryBuilder, - LargeBinaryBuilder, + BinaryBuilder, Decimal32Builder, Decimal64Builder, Decimal128Builder, Decimal256Builder, + FixedSizeBinaryBuilder, LargeBinaryBuilder, }; use arrow_array::types::*; use arrow_buffer::i256; @@ -496,25 +496,36 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo #[test] fn test_write_csv_decimal() { let schema = Schema::new(vec![ - Field::new("c1", DataType::Decimal128(38, 6), true), - Field::new("c2", DataType::Decimal256(76, 6), true), + Field::new("c1", DataType::Decimal32(9, 6), true), + Field::new("c2", DataType::Decimal64(17, 6), true), + Field::new("c3", DataType::Decimal128(38, 6), true), + Field::new("c4", DataType::Decimal256(76, 6), true), ]); - let mut c1_builder = 
Decimal128Builder::new().with_data_type(DataType::Decimal128(38, 6)); + let mut c1_builder = Decimal32Builder::new().with_data_type(DataType::Decimal32(9, 6)); c1_builder.extend(vec![Some(-3335724), Some(2179404), None, Some(290472)]); let c1 = c1_builder.finish(); - let mut c2_builder = Decimal256Builder::new().with_data_type(DataType::Decimal256(76, 6)); - c2_builder.extend(vec![ + let mut c2_builder = Decimal64Builder::new().with_data_type(DataType::Decimal64(17, 6)); + c2_builder.extend(vec![Some(-3335724), Some(2179404), None, Some(290472)]); + let c2 = c2_builder.finish(); + + let mut c3_builder = Decimal128Builder::new().with_data_type(DataType::Decimal128(38, 6)); + c3_builder.extend(vec![Some(-3335724), Some(2179404), None, Some(290472)]); + let c3 = c3_builder.finish(); + + let mut c4_builder = Decimal256Builder::new().with_data_type(DataType::Decimal256(76, 6)); + c4_builder.extend(vec![ Some(i256::from_i128(-3335724)), Some(i256::from_i128(2179404)), None, Some(i256::from_i128(290472)), ]); - let c2 = c2_builder.finish(); + let c4 = c4_builder.finish(); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![ + Arc::new(c1), Arc::new(c2), Arc::new(c3), Arc::new(c4) + ]).unwrap(); let mut file = tempfile::tempfile().unwrap(); @@ -530,15 +541,15 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo let mut buffer: Vec = vec![]; file.read_to_end(&mut buffer).unwrap(); - let expected = r#"c1,c2 --3.335724,-3.335724 -2.179404,2.179404 -, -0.290472,0.290472 --3.335724,-3.335724 -2.179404,2.179404 -, -0.290472,0.290472 + let expected = r#"c1,c2,c3,c4 +-3.335724,-3.335724,-3.335724,-3.335724 +2.179404,2.179404,2.179404,2.179404 +,,, +0.290472,0.290472,0.290472,0.290472 +-3.335724,-3.335724,-3.335724,-3.335724 +2.179404,2.179404,2.179404,2.179404 +,,, +0.290472,0.290472,0.290472,0.290472 "#; assert_eq!(expected, str::from_utf8(&buffer).unwrap()); } diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 4c21d9a7632a..6f016d213675 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -83,6 +83,10 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff | DataType::Float16 | DataType::Float32 | DataType::Float64 + | DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) | DataType::Date32 | DataType::Time32(_) | DataType::Date64 @@ -139,10 +143,6 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => { [empty_buffer, MutableBuffer::new(0)] } - DataType::Decimal32(_, _) | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], DataType::Union(_, mode) => { let type_ids = MutableBuffer::new(capacity * mem::size_of::()); match mode { diff --git a/arrow-json/src/writer/encoder.rs b/arrow-json/src/writer/encoder.rs index ed430fe6a1ec..d4c11e07ecd4 100644 --- a/arrow-json/src/writer/encoder.rs +++ b/arrow-json/src/writer/encoder.rs @@ -138,7 +138,7 @@ fn make_encoder_impl<'a>( }; (Box::new(encoder) as _, array.nulls().cloned()) } - DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { + DataType::Decimal32(_, _) | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { let options = 
FormatOptions::new().with_display_error(true);
            let formatter = ArrayFormatter::try_new(array, &options)?;
            (Box::new(RawArrayFormatter(formatter)) as _, array.nulls().cloned())
diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs
index da929ae1da74..e95e342122f0 100644
--- a/arrow/benches/cast_kernels.rs
+++ b/arrow/benches/cast_kernels.rs
@@ -283,6 +283,9 @@ fn add_benchmark(c: &mut Criterion) {
     c.bench_function("cast decimal32 to decimal32 512", |b| {
         b.iter(|| cast_array(&decimal32_array, DataType::Decimal32(8, 2)))
     });
+    c.bench_function("cast decimal64 to decimal64 512", |b| {
+        b.iter(|| cast_array(&decimal64_array, DataType::Decimal64(16, 5)))
+    });
     c.bench_function("cast decimal128 to decimal128 512", |b| {
         b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(30, 5)))
diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs
index 6378cd991e2a..f04b6236a70b 100644
--- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs
+++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs
@@ -67,7 +67,7 @@ pub fn make_fixed_len_byte_array_reader(
         ArrowType::Decimal32(_, _) => {
             if byte_length > 4 {
                 return Err(general_err!(
-                    "decimal 64 type too large, must be less then 4 bytes, got {}",
+                    "decimal 32 type too large, must be less than 4 bytes, got {}",
                     byte_length
                 ));
             }
         ArrowType::Decimal64(_, _) => {
             if byte_length > 8 {
                 return Err(general_err!(
-                    "decimal 32 type too large, must be less then 8 bytes, got {}",
+                    "decimal 64 type too large, must be less than 8 bytes, got {}",
                     byte_length
                 ));
             }

From 20ec84eb1bc58b1e155c781e98623e46d0aee242 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Wed, 18 Dec 2024 04:52:44 -0500
Subject: [PATCH 05/68] Add Field::with_dict_is_ordered (#6885)

---
 arrow-schema/src/field.rs | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs
index 7fb88d48aedb..e5ea92b689fa 100644
--- a/arrow-schema/src/field.rs
+++ b/arrow-schema/src/field.rs
@@ -426,6 +426,19 @@ impl Field {
     }

     /// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type.
+    ///
+    /// # Example
+    /// ```
+    /// # use arrow_schema::{DataType, Field};
+    /// // non dictionaries do not have a dict is ordered flag
+    /// let field = Field::new("c1", DataType::Int64, false);
+    /// assert_eq!(field.dict_is_ordered(), None);
+    /// // by default a dictionary is not ordered
+    /// let field = Field::new("c1", DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), false);
+    /// assert_eq!(field.dict_is_ordered(), Some(false));
+    /// let field = field.with_dict_is_ordered(true);
+    /// assert_eq!(field.dict_is_ordered(), Some(true));
+    /// ```
     #[inline]
     pub const fn dict_is_ordered(&self) -> Option<bool> {
         match self.data_type {
@@ -434,6 +447,18 @@ impl Field {
         }
     }

+    /// Set whether this `Field`'s dictionary is ordered, if it is a dictionary type.
+    ///
+    /// Does nothing if this is not a dictionary type.
+    ///
+    /// See [`Field::dict_is_ordered`] for more information.
+    pub fn with_dict_is_ordered(mut self, dict_is_ordered: bool) -> Self {
+        if matches!(self.data_type, DataType::Dictionary(_, _)) {
+            self.dict_is_ordered = dict_is_ordered;
+        };
+        self
+    }
+
     /// Merge this field into self if it is compatible.
     ///
     /// Struct fields are merged recursively.
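A minimal usage sketch for the `Field::with_dict_is_ordered` setter added above, assuming only the `arrow_schema` API shown in this patch (the field names are illustrative):

```rust
use arrow_schema::{DataType, Field};

fn main() {
    // A dictionary field reports Some(false) until the flag is set
    let field = Field::new(
        "tags",
        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
        true,
    );
    assert_eq!(field.dict_is_ordered(), Some(false));

    // The builder-style setter flips the flag on dictionary fields
    let field = field.with_dict_is_ordered(true);
    assert_eq!(field.dict_is_ordered(), Some(true));

    // On a non-dictionary field the setter is a no-op and the accessor returns None
    let field = Field::new("id", DataType::Int64, false).with_dict_is_ordered(true);
    assert_eq!(field.dict_is_ordered(), None);
}
```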
From 9daab3392f738ae39ee97a77bfdbcd0cc302813e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Dec 2024 05:52:15 -0500 Subject: [PATCH 06/68] Minor: make it easier to find fix instructions when `cargo fmt` on parquet fails (#6886) * Minor: make it easier to find instructions when fmt fails * purposely introduce a fmt issue * Revert "purposely introduce a fmt issue" This reverts commit 440e52079135df85128b15936425d2b5af488007. * Update .github/workflows/rust.yml Co-authored-by: Ed Seidl --------- Co-authored-by: Ed Seidl --- .github/workflows/rust.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index ff5040fd2947..72a53263d330 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -101,12 +101,13 @@ jobs: - name: Format arrow run: cargo fmt --all -- --check - name: Format parquet - # Many modules in parquet are skipped, so check parquet separately. If this check fails, run: - # cargo fmt -p parquet -- --config skip_children=true `find ./parquet -name "*.rs" \! -name format.rs` - # from the top level arrow-rs directory and check in the result. + # Many modules in parquet are skipped, so check parquet separately # https://github.com/apache/arrow-rs/issues/6179 working-directory: parquet - run: cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs` + run: | + # if this fails, run this from the parquet directory: + # cargo fmt -p parquet -- --config skip_children=true `find . -name "*.rs" \! -name format.rs` + cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs` - name: Format object_store working-directory: object_store run: cargo fmt --all -- --check From 9a4ccd1a00f4da2f9f0262a0a4453c6dd485da1e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Dec 2024 05:52:40 -0500 Subject: [PATCH 07/68] Minor: add comments explaining bad MSRV, output in json (#6857) * Minor: add comments explaining bad MSRV * purposely introduce msrv brek * output in JSON format * Revert "purposely introduce msrv brek" This reverts commit 61872b69a5a85748031fe852e48b8e3d3381d270. 
---
 .github/workflows/rust.yml | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 72a53263d330..044250b70435 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -113,7 +113,7 @@ jobs:
         run: cargo fmt --all -- --check

   msrv:
-    name: Verify MSRV
+    name: Verify MSRV (Minimum Supported Rust Version)
     runs-on: ubuntu-latest
     container:
       image: amd64/rust
@@ -127,13 +127,19 @@ jobs:
           cargo update -p ahash --precise 0.8.7
       - name: Check arrow
         working-directory: arrow
-        run: cargo msrv --log-target stdout verify
+        run: |
+          # run `cd arrow; cargo msrv verify` to see problematic dependencies
+          cargo msrv verify --output-format=json
       - name: Check parquet
         working-directory: parquet
-        run: cargo msrv --log-target stdout verify
+        run: |
+          # run `cd parquet; cargo msrv verify` to see problematic dependencies
+          cargo msrv verify --output-format=json
       - name: Check arrow-flight
         working-directory: arrow-flight
-        run: cargo msrv --log-target stdout verify
+        run: |
+          # run `cd arrow-flight; cargo msrv verify` to see problematic dependencies
+          cargo msrv verify --output-format=json
       - name: Downgrade object_store dependencies
         working-directory: object_store
         # Necessary because tokio 1.30.0 updates MSRV to 1.63
         run: |
           cargo update -p tokio --precise 1.29.1
           cargo update -p url --precise 2.5.0
       - name: Check object_store
         working-directory: object_store
-        run: cargo msrv --log-target stdout verify
+        run: |
+          # run `cd object_store; cargo msrv verify` to see problematic dependencies
+          cargo msrv verify --output-format=json

From 77e92b209b583fa607645d3fbb5ae8d7b1068a70 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Wed, 18 Dec 2024 07:25:59 -0500
Subject: [PATCH 08/68] Add 53.4.0 to release schedule (#6896)

* Add 53.4.0 to release schedule

* prettier

---
 README.md | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 57794b1d6a46..f995ff6ad478 100644
--- a/README.md
+++ b/README.md
@@ -63,13 +63,14 @@ is described in the [contributing] guide.
 ## Planned Release Schedule

-| Approximate Date | Version  | Notes                                   |
-| ---------------- | -------- | --------------------------------------- |
-| Nov 2024         | `53.3.0` | Minor, NO breaking API changes          |
-| Dec 2024         | `54.0.0` | Major, potentially breaking API changes |
-| Jan 2025         | `54.1.0` | Minor, NO breaking API changes          |
-| Feb 2025         | `54.2.0` | Minor, NO breaking API changes          |
-| Mar 2025         | `55.0.0` | Major, potentially breaking API changes |
+| Approximate Date | Version  | Notes                                      |
+| ---------------- | -------- | ------------------------------------------ |
+| Nov 2024         | `53.3.0` | Minor, NO breaking API changes             |
+| Dec 2024         | `54.0.0` | Major, potentially breaking API changes    |
+| Jan 2025         | `53.4.0` | Minor, NO breaking API changes (`53` line) |
+| Jan 2025         | `54.1.0` | Minor, NO breaking API changes             |
+| Feb 2025         | `54.2.0` | Minor, NO breaking API changes             |
+| Mar 2025         | `55.0.0` | Major, potentially breaking API changes    |

 [this ticket]: https://github.com/apache/arrow-rs/issues/5368
 [semantic versioning]: https://semver.org/

From 63f5d5e5fca38f209387891609149dd0f61680f1 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Wed, 18 Dec 2024 07:49:51 -0500
Subject: [PATCH 09/68] Add deprecation / API removal policy (#6852)

* Add deprecation / API removal policy

* Increase proposal to 2 releases

* change from policy to guidelines, add flexibility

* prettier

* Make instructions more actionable

---
 README.md         | 27 +++++++++++++++++++++++++++
 arrow/README.md   |  2 +-
 parquet/README.md |  2 +-
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f995ff6ad478..723249ad29e5 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,33 @@ versions approximately every 2 months.

 [`object_store`]: https://crates.io/crates/object_store

+### Deprecation Guidelines
+
+Minor releases may deprecate, but not remove APIs. Deprecating APIs allows
+downstream Rust programs to still compile, but generate compiler warnings. This
+gives downstream crates time to migrate prior to API removal.
+
+To deprecate an API:
+
+- Mark the API as deprecated using `#[deprecated]` and specify the exact arrow-rs version in which it was deprecated
+- Concisely describe the preferred API to help the user transition
+
+The deprecated version is the next version which will be released (please
+consult the list above). To mark the API as deprecated, use the
+`#[deprecated(since = "...", note = "...")]` attribute.
+
+For example:
+
+```rust
+#[deprecated(since = "51.0.0", note = "Use `date_part` instead")]
+```
+
+In general, deprecated APIs will remain in the codebase for at least two major releases after
+they were deprecated (typically 6 to 9 months later). For example, an API
+deprecated in `51.3.0` can be removed in `54.0.0` (or later). Deprecated APIs
+may be removed earlier or later than these guidelines at the discretion of the
+maintainers.
+
 ## Related Projects

 There are several related crates in different repositories

diff --git a/arrow/README.md b/arrow/README.md
index a1444005ec00..79aefaae9053 100644
--- a/arrow/README.md
+++ b/arrow/README.md
@@ -37,7 +37,7 @@ This crate is tested with the latest stable version of Rust. We do not currently

 The `arrow` crate follows the [SemVer standard] defined by Cargo and works well
 within the Rust crate ecosystem. See the [repository README] for more details on
-the release schedule and version.
+the release schedule, version and deprecation policy.
[SemVer standard]: https://doc.rust-lang.org/cargo/reference/semver.html [repository README]: https://github.com/apache/arrow-rs diff --git a/parquet/README.md b/parquet/README.md index e9f52ff279d5..9ff1d921d692 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -36,7 +36,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The `parquet` crate follows the [SemVer standard] defined by Cargo and works well within the Rust crate ecosystem. See the [repository README] for more details on -the release schedule and version. +the release schedule, version and deprecation policy. [semver standard]: https://doc.rust-lang.org/cargo/reference/semver.html [repository readme]: https://github.com/apache/arrow-rs From b8cc13e9143cc7c005a739b26da642c5d356736f Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 18 Dec 2024 05:57:53 -0800 Subject: [PATCH 10/68] Enable string-based column projections from Parquet files (#6871) * add function to create ProjectionMask from column names * add some more tests --- parquet/src/arrow/arrow_reader/mod.rs | 68 ++++++++++ parquet/src/arrow/mod.rs | 178 +++++++++++++++++++++++++- parquet/src/arrow/schema/mod.rs | 11 ++ 3 files changed, 256 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 378884a1c430..6eba04c86f91 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -989,6 +989,21 @@ mod tests { assert_eq!(original_schema.fields()[1], reader.schema().fields()[0]); } + #[test] + fn test_arrow_reader_single_column_by_name() { + let file = get_test_file("parquet/generated_simple_numerics/blogs.parquet"); + + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + let original_schema = Arc::clone(builder.schema()); + + let mask = ProjectionMask::columns(builder.parquet_schema(), ["blog_id"]); + let reader = builder.with_projection(mask).build().unwrap(); + + // Verify that the schema was correctly parsed + assert_eq!(1, reader.schema().fields().len()); + assert_eq!(original_schema.fields()[1], reader.schema().fields()[0]); + } + #[test] fn test_null_column_reader_test() { let mut file = tempfile::tempfile().unwrap(); @@ -2563,6 +2578,59 @@ mod tests { } } + #[test] + // same as test_read_structs but constructs projection mask via column names + fn test_read_structs_by_name() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/nested_structs.rust.parquet"); + let file = File::open(&path).unwrap(); + let record_batch_reader = ParquetRecordBatchReader::try_new(file, 60).unwrap(); + + for batch in record_batch_reader { + batch.unwrap(); + } + + let file = File::open(&path).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + + let mask = ProjectionMask::columns( + builder.parquet_schema(), + ["roll_num.count", "PC_CUR.mean", "PC_CUR.sum"], + ); + let projected_reader = builder + .with_projection(mask) + .with_batch_size(60) + .build() + .unwrap(); + + let expected_schema = Schema::new(vec![ + Field::new( + "roll_num", + ArrowDataType::Struct(Fields::from(vec![Field::new( + "count", + ArrowDataType::UInt64, + false, + )])), + false, + ), + Field::new( + "PC_CUR", + ArrowDataType::Struct(Fields::from(vec![ + Field::new("mean", ArrowDataType::Int64, false), + Field::new("sum", ArrowDataType::Int64, false), + ])), + false, + ), + ]); + + assert_eq!(&expected_schema, projected_reader.schema().as_ref()); + + for batch in 
projected_reader { + let batch = batch.unwrap(); + assert_eq!(batch.schema().as_ref(), &expected_schema); + } + } + #[test] fn test_read_maps() { let testdata = arrow::util::test_util::parquet_test_data(); diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index d77436bc1ff7..6777e00fb05c 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -108,12 +108,14 @@ pub mod async_writer; mod record_reader; experimental!(mod schema); +use std::sync::Arc; + pub use self::arrow_writer::ArrowWriter; #[cfg(feature = "async")] pub use self::async_reader::ParquetRecordBatchStreamBuilder; #[cfg(feature = "async")] pub use self::async_writer::AsyncArrowWriter; -use crate::schema::types::SchemaDescriptor; +use crate::schema::types::{SchemaDescriptor, Type}; use arrow_schema::{FieldRef, Schema}; // continue to export deprecated methods until they are removed @@ -210,6 +212,71 @@ impl ProjectionMask { Self { mask: Some(mask) } } + // Given a starting point in the schema, do a DFS for that node adding leaf paths to `paths`. + fn find_leaves(root: &Arc, parent: Option<&String>, paths: &mut Vec) { + let path = parent + .map(|p| [p, root.name()].join(".")) + .unwrap_or(root.name().to_string()); + if root.is_group() { + for child in root.get_fields() { + Self::find_leaves(child, Some(&path), paths); + } + } else { + // Reached a leaf, add to paths + paths.push(path); + } + } + + /// Create a [`ProjectionMask`] which selects only the named columns + /// + /// All leaf columns that fall below a given name will be selected. For example, given + /// the schema + /// ```ignore + /// message schema { + /// OPTIONAL group a (MAP) { + /// REPEATED group key_value { + /// REQUIRED BYTE_ARRAY key (UTF8); // leaf index 0 + /// OPTIONAL group value (MAP) { + /// REPEATED group key_value { + /// REQUIRED INT32 key; // leaf index 1 + /// REQUIRED BOOLEAN value; // leaf index 2 + /// } + /// } + /// } + /// } + /// REQUIRED INT32 b; // leaf index 3 + /// REQUIRED DOUBLE c; // leaf index 4 + /// } + /// ``` + /// `["a.key_value.value", "c"]` would return leaf columns 1, 2, and 4. `["a"]` would return + /// columns 0, 1, and 2. + /// + /// Note: repeated or out of order indices will not impact the final mask. + /// + /// i.e. `["b", "c"]` will construct the same mask as `["c", "b", "c"]`. 
+ pub fn columns<'a>( + schema: &SchemaDescriptor, + names: impl IntoIterator, + ) -> Self { + // first make vector of paths for leaf columns + let mut paths: Vec = vec![]; + for root in schema.root_schema().get_fields() { + Self::find_leaves(root, None, &mut paths); + } + assert_eq!(paths.len(), schema.num_columns()); + + let mut mask = vec![false; schema.num_columns()]; + for name in names { + for idx in 0..schema.num_columns() { + if paths[idx].starts_with(name) { + mask[idx] = true; + } + } + } + + Self { mask: Some(mask) } + } + /// Returns true if the leaf column `leaf_idx` is included by the mask pub fn leaf_included(&self, leaf_idx: usize) -> bool { self.mask.as_ref().map(|m| m[leaf_idx]).unwrap_or(true) @@ -246,10 +313,14 @@ mod test { use crate::arrow::ArrowWriter; use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetMetaDataWriter}; use crate::file::properties::{EnabledStatistics, WriterProperties}; + use crate::schema::parser::parse_message_type; + use crate::schema::types::SchemaDescriptor; use arrow_array::{ArrayRef, Int32Array, RecordBatch}; use bytes::Bytes; use std::sync::Arc; + use super::ProjectionMask; + #[test] // Reproducer for https://github.com/apache/arrow-rs/issues/6464 fn test_metadata_read_write_partial_offset() { @@ -375,4 +446,109 @@ mod test { .unwrap(); Bytes::from(buf) } + + #[test] + fn test_mask_from_column_names() { + let message_type = " + message test_schema { + OPTIONAL group a (MAP) { + REPEATED group key_value { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL group value (MAP) { + REPEATED group key_value { + REQUIRED INT32 key; + REQUIRED BOOLEAN value; + } + } + } + } + REQUIRED INT32 b; + REQUIRED DOUBLE c; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + + let mask = ProjectionMask::columns(&schema, ["foo", "bar"]); + assert_eq!(mask.mask.unwrap(), vec![false; 5]); + + let mask = ProjectionMask::columns(&schema, []); + assert_eq!(mask.mask.unwrap(), vec![false; 5]); + + let mask = ProjectionMask::columns(&schema, ["a", "c"]); + assert_eq!(mask.mask.unwrap(), [true, true, true, false, true]); + + let mask = ProjectionMask::columns(&schema, ["a.key_value.key", "c"]); + assert_eq!(mask.mask.unwrap(), [true, false, false, false, true]); + + let mask = ProjectionMask::columns(&schema, ["a.key_value.value", "b"]); + assert_eq!(mask.mask.unwrap(), [false, true, true, true, false]); + + let message_type = " + message test_schema { + OPTIONAL group a (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL BYTE_ARRAY element (UTF8); + } + } + } + } + } + } + REQUIRED INT32 b; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + + let mask = ProjectionMask::columns(&schema, ["a", "b"]); + assert_eq!(mask.mask.unwrap(), [true, true]); + + let mask = ProjectionMask::columns(&schema, ["a.list.element", "b"]); + assert_eq!(mask.mask.unwrap(), [true, true]); + + let mask = + ProjectionMask::columns(&schema, ["a.list.element.list.element.list.element", "b"]); + assert_eq!(mask.mask.unwrap(), [true, true]); + + let mask = ProjectionMask::columns(&schema, ["b"]); + assert_eq!(mask.mask.unwrap(), [false, true]); + + let message_type = " + message test_schema { + OPTIONAL INT32 a; + OPTIONAL INT32 b; + OPTIONAL INT32 c; + OPTIONAL INT32 d; + OPTIONAL 
INT32 e; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + + let mask = ProjectionMask::columns(&schema, ["a", "b"]); + assert_eq!(mask.mask.unwrap(), [true, true, false, false, false]); + + let mask = ProjectionMask::columns(&schema, ["d", "b", "d"]); + assert_eq!(mask.mask.unwrap(), [false, true, false, true, false]); + + let message_type = " + message test_schema { + OPTIONAL INT32 a; + OPTIONAL INT32 b; + OPTIONAL INT32 a; + OPTIONAL INT32 d; + OPTIONAL INT32 e; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + + let mask = ProjectionMask::columns(&schema, ["a", "e"]); + assert_eq!(mask.mask.unwrap(), [true, false, true, false, true]); + } } diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index be7fa9a00d31..c9051062204d 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -1400,6 +1400,17 @@ mod tests { for i in 0..arrow_fields.len() { assert_eq!(&arrow_fields[i], converted_fields[i].as_ref()); } + + let mask = + ProjectionMask::columns(&parquet_schema, ["group2.leaf4", "group1.leaf1", "leaf5"]); + let converted_arrow_schema = + parquet_to_arrow_schema_by_columns(&parquet_schema, mask, None).unwrap(); + let converted_fields = converted_arrow_schema.fields(); + + assert_eq!(arrow_fields.len(), converted_fields.len()); + for i in 0..arrow_fields.len() { + assert_eq!(&arrow_fields[i], converted_fields[i].as_ref()); + } } #[test] From 1e582ad38a2e7c45d7a4e75a5867c93f48fe2583 Mon Sep 17 00:00:00 2001 From: xxchan Date: Wed, 18 Dec 2024 21:58:19 +0800 Subject: [PATCH 11/68] doc: add comment for timezone string (#6899) * doc: add comment for timezone string Signed-off-by: xxchan * Update arrow-schema/src/datatype.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Signed-off-by: xxchan Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-schema/src/datatype.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 1964fc317a7b..eb5ea0c7cb3b 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -196,6 +196,14 @@ pub enum DataType { /// DataType::Timestamp(TimeUnit::Second, Some("literal".into())); /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into())); /// ``` + /// + /// Timezone string parsing + /// ----------------------- + /// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930". + /// + /// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/) + /// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) + /// timezones. Timestamp(TimeUnit, Option>), /// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days. 
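A minimal sketch of the timezone strings described by the doc comment above; the fixed-offset forms parse without optional features, while using an IANA name such as `"Asia/Tokyo"` assumes the `chrono-tz` feature is enabled:

```rust
use arrow_schema::{DataType, TimeUnit};

fn main() {
    // Fixed offsets are always accepted
    let _t1 = DataType::Timestamp(TimeUnit::Second, Some("+09:00".into()));
    let _t2 = DataType::Timestamp(TimeUnit::Second, Some("-09".into()));
    let _t3 = DataType::Timestamp(TimeUnit::Second, Some("+0930".into()));

    // IANA database names additionally require the `chrono-tz` feature
    // when the timezone string is actually parsed
    let _t4 = DataType::Timestamp(TimeUnit::Second, Some("Asia/Tokyo".into()));
}
```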
From 4c2b75b9f09d651a687479b385a4226cda286da1 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 19 Dec 2024 08:56:05 -0500
Subject: [PATCH 12/68] Update version to 54.0.0, add CHANGELOG (#6894)

* Update version to 54.0.0

* Update changelog

* update notes

* updates

* update

---
 CHANGELOG-old.md                 | 170 ++++++++++++++++++++++++++++++
 CHANGELOG.md                     | 173 +++++++++++++++++--------------
 Cargo.toml                       |  32 +++---
 arrow-flight/README.md           |   2 +-
 dev/release/update_change_log.sh |   4 +-
 5 files changed, 283 insertions(+), 98 deletions(-)

diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md
index 376da6277114..3fb17b390ac1 100644
--- a/CHANGELOG-old.md
+++ b/CHANGELOG-old.md
@@ -19,6 +19,176 @@
 # Historical Changelog
+
+## [53.3.0](https://github.com/apache/arrow-rs/tree/53.3.0) (2024-11-17)
+
+[Full Changelog](https://github.com/apache/arrow-rs/compare/53.2.0...53.3.0)
+
+**Implemented enhancements:**
+
+- `PartialEq` of GenericByteViewArray \(StringViewArray / ByteViewArray\) that compares on equality rather than logical value [\#6679](https://github.com/apache/arrow-rs/issues/6679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Need a mechanism to handle schema changes due to dictionary hydration in FlightSQL server implementations [\#6672](https://github.com/apache/arrow-rs/issues/6672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
+- Support encoding Utf8View columns to JSON [\#6642](https://github.com/apache/arrow-rs/issues/6642) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Implement `append_n` for `BooleanBuilder` [\#6634](https://github.com/apache/arrow-rs/issues/6634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Some take optimizations [\#6621](https://github.com/apache/arrow-rs/issues/6621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Error Instead of Panic On Attempting to Write More Than 32769 Row Groups [\#6591](https://github.com/apache/arrow-rs/issues/6591) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Make casting from a timestamp without timezone to a timestamp with timezone configurable [\#6555](https://github.com/apache/arrow-rs/issues/6555)
+- Add `record_batch!` macro for easy record batch creation [\#6553](https://github.com/apache/arrow-rs/issues/6553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Support `Binary` --\> `Utf8View` casting [\#6531](https://github.com/apache/arrow-rs/issues/6531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- `downcast_primitive_array` and `downcast_dictionary_array` are not hygienic wrt imports [\#6400](https://github.com/apache/arrow-rs/issues/6400) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Implement interleave\_record\_batch [\#6731](https://github.com/apache/arrow-rs/pull/6731) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waynexia](https://github.com/waynexia))
+- feat: `record_batch!` macro [\#6588](https://github.com/apache/arrow-rs/pull/6588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ByteBaker](https://github.com/ByteBaker))
+
+**Fixed bugs:**
+
+- Signed decimal e-notation parsing bug [\#6728](https://github.com/apache/arrow-rs/issues/6728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Add support for Utf8View -\> numeric in can\_cast\_types [\#6715](https://github.com/apache/arrow-rs/issues/6715)
+- IPC file writer produces incorrect footer when not preserving dict ID [\#6710](https://github.com/apache/arrow-rs/issues/6710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- parquet from\_thrift\_helper incorrectly checks index [\#6693](https://github.com/apache/arrow-rs/issues/6693) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Primitive REPEATED fields not contained in LIST annotated groups aren't read as lists by record reader [\#6648](https://github.com/apache/arrow-rs/issues/6648) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- DictionaryHandling does not recurse into Map fields 
[\#6644](https://github.com/apache/arrow-rs/issues/6644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Array writer output empty when no record is written [\#6613](https://github.com/apache/arrow-rs/issues/6613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Archery Integration Test with c\# failing on main [\#6577](https://github.com/apache/arrow-rs/issues/6577) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Potential unsoundness in `filter_run_end_array` [\#6569](https://github.com/apache/arrow-rs/issues/6569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet reader can generate incorrect validity buffer information for nested structures [\#6510](https://github.com/apache/arrow-rs/issues/6510) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- arrow-array ffi: FFI\_ArrowArray.null\_count is always interpreted as unsigned and initialized during conversion from C to Rust. [\#6497](https://github.com/apache/arrow-rs/issues/6497) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Minor: Document pattern for accessing views in StringView [\#6673](https://github.com/apache/arrow-rs/pull/6673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve Array::is\_nullable documentation [\#6615](https://github.com/apache/arrow-rs/pull/6615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Minor: improve docs for ByteViewArray-\>ByteArray From impl [\#6610](https://github.com/apache/arrow-rs/pull/6610) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Performance improvements:** + +- Speed up `filter_run_end_array` [\#6712](https://github.com/apache/arrow-rs/pull/6712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) + +**Closed issues:** + +- Incorrect like results for pattern starting/ending with `%` percent and containing escape characters [\#6702](https://github.com/apache/arrow-rs/issues/6702) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Fix signed decimal e-notation parsing [\#6729](https://github.com/apache/arrow-rs/pull/6729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) +- Clean up some arrow-flight tests and duplicated code [\#6725](https://github.com/apache/arrow-rs/pull/6725) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime)) +- Update PR template section about API breaking changes [\#6723](https://github.com/apache/arrow-rs/pull/6723) ([findepi](https://github.com/findepi)) +- Support for casting `StringViewArray` to `DecimalArray` [\#6720](https://github.com/apache/arrow-rs/pull/6720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- File writer preserve dict bug [\#6711](https://github.com/apache/arrow-rs/pull/6711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- Add filter\_kernel benchmark for run array [\#6706](https://github.com/apache/arrow-rs/pull/6706) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- 
Fix string view ILIKE checks with NULL values [\#6705](https://github.com/apache/arrow-rs/pull/6705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Implement logical\_null\_count for more array types [\#6704](https://github.com/apache/arrow-rs/pull/6704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Fix LIKE with escapes [\#6703](https://github.com/apache/arrow-rs/pull/6703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Speed up `filter_bytes` [\#6699](https://github.com/apache/arrow-rs/pull/6699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Minor: fix misleading comment in byte view [\#6695](https://github.com/apache/arrow-rs/pull/6695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) +- minor fix on checking index [\#6694](https://github.com/apache/arrow-rs/pull/6694) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jp0317](https://github.com/jp0317)) +- Undo run end filter performance regression [\#6691](https://github.com/apache/arrow-rs/pull/6691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- Reimplement `PartialEq` of `GenericByteViewArray` compares by logical value [\#6689](https://github.com/apache/arrow-rs/pull/6689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- feat: expose known\_schema from FlightDataEncoder [\#6688](https://github.com/apache/arrow-rs/pull/6688) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) +- Update hashbrown requirement from 0.14.2 to 0.15.1 [\#6684](https://github.com/apache/arrow-rs/pull/6684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support Duration in JSON Reader [\#6683](https://github.com/apache/arrow-rs/pull/6683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([simonvandel](https://github.com/simonvandel)) +- Check predicate and values are the same length for run end array filter safety [\#6675](https://github.com/apache/arrow-rs/pull/6675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- \[ffi\] Fix arrow-array null\_count error during conversion from C to Rust [\#6674](https://github.com/apache/arrow-rs/pull/6674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adbmal](https://github.com/adbmal)) +- Support `Utf8View` for `bit_length` kernel [\#6671](https://github.com/apache/arrow-rs/pull/6671) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([austin362667](https://github.com/austin362667)) +- Fix string view LIKE checks with NULL values [\#6662](https://github.com/apache/arrow-rs/pull/6662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Improve documentation for `nullif` kernel [\#6658](https://github.com/apache/arrow-rs/pull/6658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve test\_auth error message when contains\(\) fails 
[\#6657](https://github.com/apache/arrow-rs/pull/6657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Let std::fmt::Debug for StructArray output Null/Validity info [\#6655](https://github.com/apache/arrow-rs/pull/6655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([XinyuZeng](https://github.com/XinyuZeng)) +- Include offending line number when processing CSV file fails [\#6653](https://github.com/apache/arrow-rs/pull/6653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- feat: add write\_bytes for GenericBinaryBuilder [\#6652](https://github.com/apache/arrow-rs/pull/6652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tisonkun](https://github.com/tisonkun)) +- feat: Support Utf8View in JSON serialization [\#6651](https://github.com/apache/arrow-rs/pull/6651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonmmease](https://github.com/jonmmease)) +- fix: include chrono-tz in flight sql cli [\#6650](https://github.com/apache/arrow-rs/pull/6650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Handle primitive REPEATED field not contained in LIST annotated group [\#6649](https://github.com/apache/arrow-rs/pull/6649) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) +- Implement `append_n` for `BooleanBuilder` [\#6646](https://github.com/apache/arrow-rs/pull/6646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- fix: recurse into Map datatype when hydrating dictionaries [\#6645](https://github.com/apache/arrow-rs/pull/6645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) +- fix: enable TLS roots for flight CLI client [\#6640](https://github.com/apache/arrow-rs/pull/6640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- doc: Clarify take kernel semantics [\#6632](https://github.com/apache/arrow-rs/pull/6632) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Return error rather than panic when too many row groups are written [\#6629](https://github.com/apache/arrow-rs/pull/6629) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Fix test feature selection so all feature combinations work as expected [\#6626](https://github.com/apache/arrow-rs/pull/6626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([itsjunetime](https://github.com/itsjunetime)) +- Add Parquet RowSelection benchmark [\#6623](https://github.com/apache/arrow-rs/pull/6623) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao)) +- Optimize `take_bits` to optimize `take_boolean` / `take_primitive` / `take_byte_view`: up to -25% [\#6622](https://github.com/apache/arrow-rs/pull/6622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Make downcast macros hygenic \(\#6400\) 
[\#6620](https://github.com/apache/arrow-rs/pull/6620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.88 to =1.0.89 [\#6618](https://github.com/apache/arrow-rs/pull/6618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix arrow-json writer empty [\#6614](https://github.com/apache/arrow-rs/pull/6614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gwik](https://github.com/gwik)) +- Add `ParquetObjectReader::with_runtime` [\#6612](https://github.com/apache/arrow-rs/pull/6612) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([itsjunetime](https://github.com/itsjunetime)) +- Re-enable `C#` arrow flight integration test [\#6611](https://github.com/apache/arrow-rs/pull/6611) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add Array::logical\_null\_count for inspecting number of null values [\#6608](https://github.com/apache/arrow-rs/pull/6608) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Added casting from Binary/LargeBinary to Utf8View [\#6592](https://github.com/apache/arrow-rs/pull/6592) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ngli-me](https://github.com/ngli-me)) +- Parquet AsyncReader: Don't panic when empty offset\_index is Some\(\[\]\) [\#6582](https://github.com/apache/arrow-rs/pull/6582) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jroddev](https://github.com/jroddev)) +- Skip writing down null buffers for non-nullable primitive arrays [\#6524](https://github.com/apache/arrow-rs/pull/6524) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([bkirwi](https://github.com/bkirwi)) ## [53.2.0](https://github.com/apache/arrow-rs/tree/53.2.0) (2024-10-21) [Full Changelog](https://github.com/apache/arrow-rs/compare/53.1.0...53.2.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b729360608b..a7f2a4ff34d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,101 +19,116 @@ # Changelog -## [53.3.0](https://github.com/apache/arrow-rs/tree/53.3.0) (2024-11-17) +## [54.0.0](https://github.com/apache/arrow-rs/tree/54.0.0) (2024-12-18) -[Full Changelog](https://github.com/apache/arrow-rs/compare/53.2.0...53.3.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/53.3.0...54.0.0) + +**Breaking changes:** + +- avoid redundant parsing of repeated value in RleDecoder [\#6834](https://github.com/apache/arrow-rs/pull/6834) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jp0317](https://github.com/jp0317)) +- Handling nullable DictionaryArray in CSV parser [\#6830](https://github.com/apache/arrow-rs/pull/6830) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([edmondop](https://github.com/edmondop)) +- fix\(flightsql\): remove Any encoding of DoPutUpdateResult [\#6825](https://github.com/apache/arrow-rs/pull/6825) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([davisp](https://github.com/davisp)) +- arrow-ipc: Default to not preserving dict IDs [\#6788](https://github.com/apache/arrow-rs/pull/6788) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- Remove some very old deprecated functions [\#6774](https://github.com/apache/arrow-rs/pull/6774) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- update to pyo3 0.23.0 [\#6745](https://github.com/apache/arrow-rs/pull/6745) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Remove APIs deprecated since v 4.4.0 [\#6722](https://github.com/apache/arrow-rs/pull/6722) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Return `None` when Parquet page indexes are not present in file [\#6639](https://github.com/apache/arrow-rs/pull/6639) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Add `ParquetError::NeedMoreData` mark `ParquetError` as `non_exhaustive` [\#6630](https://github.com/apache/arrow-rs/pull/6630) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Remove APIs deprecated since v 2.0.0 [\#6609](https://github.com/apache/arrow-rs/pull/6609) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) **Implemented enhancements:** -- `PartialEq` of GenericByteViewArray \(StringViewArray / ByteViewArray\) that compares on equality rather than logical value [\#6679](https://github.com/apache/arrow-rs/issues/6679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Need a mechanism to handle schema changes due to dictionary hydration in FlightSQL server implementations [\#6672](https://github.com/apache/arrow-rs/issues/6672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Support encoding Utf8View columns to JSON [\#6642](https://github.com/apache/arrow-rs/issues/6642) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement `append_n` for `BooleanBuilder` [\#6634](https://github.com/apache/arrow-rs/issues/6634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Some take optimizations [\#6621](https://github.com/apache/arrow-rs/issues/6621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Error Instead of Panic On Attempting to Write More Than 32769 Row Groups [\#6591](https://github.com/apache/arrow-rs/issues/6591) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Make casting from a timestamp without timezone to a timestamp with timezone configurable [\#6555](https://github.com/apache/arrow-rs/issues/6555) -- Add `record_batch!` macro for easy record batch creation [\#6553](https://github.com/apache/arrow-rs/issues/6553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `Binary` --\> `Utf8View` casting [\#6531](https://github.com/apache/arrow-rs/issues/6531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `downcast_primitive_array` and `downcast_dictionary_array` are not hygienic wrt imports [\#6400](https://github.com/apache/arrow-rs/issues/6400) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement interleave\_record\_batch [\#6731](https://github.com/apache/arrow-rs/pull/6731) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([waynexia](https://github.com/waynexia)) -- feat: `record_batch!` macro [\#6588](https://github.com/apache/arrow-rs/pull/6588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ByteBaker](https://github.com/ByteBaker)) +- Parquet schema hint doesn't support integer types upcasting [\#6891](https://github.com/apache/arrow-rs/issues/6891) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet UTF-8 max statistics are overly pessimistic [\#6867](https://github.com/apache/arrow-rs/issues/6867) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add builder support for Int8 keys [\#6844](https://github.com/apache/arrow-rs/issues/6844) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Formalize the name of the nested `Field` in a list [\#6784](https://github.com/apache/arrow-rs/issues/6784) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Allow disabling the writing of Parquet Offset Index [\#6778](https://github.com/apache/arrow-rs/issues/6778) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `parquet::record::make_row` is not exposed to users, leaving no option to users to manually create `Row` objects [\#6761](https://github.com/apache/arrow-rs/issues/6761) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Avoid `from_num_days_from_ce_opt` calls in `timestamp_s_to_datetime` if we don't need [\#6746](https://github.com/apache/arrow-rs/issues/6746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Temporal -\> Utf8View casting [\#6734](https://github.com/apache/arrow-rs/issues/6734) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add Option To Coerce List Type on Parquet Write [\#6733](https://github.com/apache/arrow-rs/issues/6733) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Numeric -\> Utf8View casting [\#6714](https://github.com/apache/arrow-rs/issues/6714) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Utf8View \<=\> boolean casting [\#6713](https://github.com/apache/arrow-rs/issues/6713) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- Signed decimal e-notation parsing bug [\#6728](https://github.com/apache/arrow-rs/issues/6728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add support for Utf8View -\> numeric in can\_cast\_types [\#6715](https://github.com/apache/arrow-rs/issues/6715) -- IPC file writer produces incorrect footer when not preserving dict ID [\#6710](https://github.com/apache/arrow-rs/issues/6710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- parquet from\_thrift\_helper incorrectly checks index [\#6693](https://github.com/apache/arrow-rs/issues/6693) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Primitive REPEATED fields not contained in LIST annotated groups aren't read as lists by record reader [\#6648](https://github.com/apache/arrow-rs/issues/6648) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- DictionaryHandling does not recurse into Map fields [\#6644](https://github.com/apache/arrow-rs/issues/6644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Array writer output empty when no record is written 
[\#6613](https://github.com/apache/arrow-rs/issues/6613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Archery Integration Test with c\# failing on main [\#6577](https://github.com/apache/arrow-rs/issues/6577) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Potential unsoundness in `filter_run_end_array` [\#6569](https://github.com/apache/arrow-rs/issues/6569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet reader can generate incorrect validity buffer information for nested structures [\#6510](https://github.com/apache/arrow-rs/issues/6510) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- arrow-array ffi: FFI\_ArrowArray.null\_count is always interpreted as unsigned and initialized during conversion from C to Rust. [\#6497](https://github.com/apache/arrow-rs/issues/6497) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `Buffer::bit_slice` loses length with byte-aligned offsets [\#6895](https://github.com/apache/arrow-rs/issues/6895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet arrow writer doesn't track memory size correctly for fixed sized lists [\#6839](https://github.com/apache/arrow-rs/issues/6839) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Casting Decimal128 to Decimal128 with smaller precision produces incorrect results in some cases [\#6833](https://github.com/apache/arrow-rs/issues/6833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Should empty nullable dictionary be parsed as null from arrow-csv? [\#6821](https://github.com/apache/arrow-rs/issues/6821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Array take doesn't make fields nullable [\#6809](https://github.com/apache/arrow-rs/issues/6809) +- Arrow Flight Encodes a Slice's List Offsets If the slice offset is starts with zero [\#6803](https://github.com/apache/arrow-rs/issues/6803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet readers incorrectly interpret legacy nested lists [\#6756](https://github.com/apache/arrow-rs/issues/6756) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- filter\_bits under-allocates resulting boolean buffer [\#6750](https://github.com/apache/arrow-rs/issues/6750) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Multi-language support issues with Arrow FlightSQL client's execute\_update and execute\_ingest methods [\#6545](https://github.com/apache/arrow-rs/issues/6545) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Documentation updates:** -- Minor: Document pattern for accessing views in StringView [\#6673](https://github.com/apache/arrow-rs/pull/6673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Improve Array::is\_nullable documentation [\#6615](https://github.com/apache/arrow-rs/pull/6615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Minor: improve docs for ByteViewArray-\>ByteArray From impl [\#6610](https://github.com/apache/arrow-rs/pull/6610) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) - -**Performance improvements:** - -- Speed up `filter_run_end_array` [\#6712](https://github.com/apache/arrow-rs/pull/6712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Should we document at what 
rate deprecated APIs are removed? [\#6851](https://github.com/apache/arrow-rs/issues/6851) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix docstring for `Format::with_header` in `arrow-csv` [\#6856](https://github.com/apache/arrow-rs/pull/6856) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kylebarron](https://github.com/kylebarron)) +- Add deprecation / API removal policy [\#6852](https://github.com/apache/arrow-rs/pull/6852) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Minor: add example for creating `SchemaDescriptor` [\#6841](https://github.com/apache/arrow-rs/pull/6841) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- chore: enrich panic context when BooleanBuffer fails to create [\#6810](https://github.com/apache/arrow-rs/pull/6810) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tisonkun](https://github.com/tisonkun)) **Closed issues:** -- Incorrect like results for pattern starting/ending with `%` percent and containing escape characters [\#6702](https://github.com/apache/arrow-rs/issues/6702) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[FlightSQL\] GetCatalogsBuilder does not sort the catalog names [\#6807](https://github.com/apache/arrow-rs/issues/6807) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Add a lint to automatically check for unused dependencies [\#6796](https://github.com/apache/arrow-rs/issues/6796) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Merged pull requests:** -- Fix signed decimal e-notation parsing [\#6729](https://github.com/apache/arrow-rs/pull/6729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) -- Clean up some arrow-flight tests and duplicated code [\#6725](https://github.com/apache/arrow-rs/pull/6725) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime)) -- Update PR template section about API breaking changes [\#6723](https://github.com/apache/arrow-rs/pull/6723) ([findepi](https://github.com/findepi)) -- Support for casting `StringViewArray` to `DecimalArray` [\#6720](https://github.com/apache/arrow-rs/pull/6720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) -- File writer preserve dict bug [\#6711](https://github.com/apache/arrow-rs/pull/6711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) -- Add filter\_kernel benchmark for run array [\#6706](https://github.com/apache/arrow-rs/pull/6706) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) -- Fix string view ILIKE checks with NULL values [\#6705](https://github.com/apache/arrow-rs/pull/6705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Implement logical\_null\_count for more array types [\#6704](https://github.com/apache/arrow-rs/pull/6704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Fix LIKE with 
escapes [\#6703](https://github.com/apache/arrow-rs/pull/6703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Speed up `filter_bytes` [\#6699](https://github.com/apache/arrow-rs/pull/6699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Minor: fix misleading comment in byte view [\#6695](https://github.com/apache/arrow-rs/pull/6695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) -- minor fix on checking index [\#6694](https://github.com/apache/arrow-rs/pull/6694) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jp0317](https://github.com/jp0317)) -- Undo run end filter performance regression [\#6691](https://github.com/apache/arrow-rs/pull/6691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) -- Reimplement `PartialEq` of `GenericByteViewArray` compares by logical value [\#6689](https://github.com/apache/arrow-rs/pull/6689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) -- feat: expose known\_schema from FlightDataEncoder [\#6688](https://github.com/apache/arrow-rs/pull/6688) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) -- Update hashbrown requirement from 0.14.2 to 0.15.1 [\#6684](https://github.com/apache/arrow-rs/pull/6684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Support Duration in JSON Reader [\#6683](https://github.com/apache/arrow-rs/pull/6683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([simonvandel](https://github.com/simonvandel)) -- Check predicate and values are the same length for run end array filter safety [\#6675](https://github.com/apache/arrow-rs/pull/6675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) -- \[ffi\] Fix arrow-array null\_count error during conversion from C to Rust [\#6674](https://github.com/apache/arrow-rs/pull/6674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adbmal](https://github.com/adbmal)) -- Support `Utf8View` for `bit_length` kernel [\#6671](https://github.com/apache/arrow-rs/pull/6671) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([austin362667](https://github.com/austin362667)) -- Fix string view LIKE checks with NULL values [\#6662](https://github.com/apache/arrow-rs/pull/6662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Improve documentation for `nullif` kernel [\#6658](https://github.com/apache/arrow-rs/pull/6658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Improve test\_auth error message when contains\(\) fails [\#6657](https://github.com/apache/arrow-rs/pull/6657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) -- Let std::fmt::Debug for StructArray output Null/Validity info [\#6655](https://github.com/apache/arrow-rs/pull/6655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([XinyuZeng](https://github.com/XinyuZeng)) -- Include 
offending line number when processing CSV file fails [\#6653](https://github.com/apache/arrow-rs/pull/6653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- feat: add write\_bytes for GenericBinaryBuilder [\#6652](https://github.com/apache/arrow-rs/pull/6652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tisonkun](https://github.com/tisonkun)) -- feat: Support Utf8View in JSON serialization [\#6651](https://github.com/apache/arrow-rs/pull/6651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonmmease](https://github.com/jonmmease)) -- fix: include chrono-tz in flight sql cli [\#6650](https://github.com/apache/arrow-rs/pull/6650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- Handle primitive REPEATED field not contained in LIST annotated group [\#6649](https://github.com/apache/arrow-rs/pull/6649) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) -- Implement `append_n` for `BooleanBuilder` [\#6646](https://github.com/apache/arrow-rs/pull/6646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) -- fix: recurse into Map datatype when hydrating dictionaries [\#6645](https://github.com/apache/arrow-rs/pull/6645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) -- fix: enable TLS roots for flight CLI client [\#6640](https://github.com/apache/arrow-rs/pull/6640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- doc: Clarify take kernel semantics [\#6632](https://github.com/apache/arrow-rs/pull/6632) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Return error rather than panic when too many row groups are written [\#6629](https://github.com/apache/arrow-rs/pull/6629) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- Fix test feature selection so all feature combinations work as expected [\#6626](https://github.com/apache/arrow-rs/pull/6626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([itsjunetime](https://github.com/itsjunetime)) -- Add Parquet RowSelection benchmark [\#6623](https://github.com/apache/arrow-rs/pull/6623) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao)) -- Optimize `take_bits` to optimize `take_boolean` / `take_primitive` / `take_byte_view`: up to -25% [\#6622](https://github.com/apache/arrow-rs/pull/6622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Make downcast macros hygenic \(\#6400\) [\#6620](https://github.com/apache/arrow-rs/pull/6620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update proc-macro2 requirement from =1.0.88 to =1.0.89 [\#6618](https://github.com/apache/arrow-rs/pull/6618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Fix arrow-json writer 
empty [\#6614](https://github.com/apache/arrow-rs/pull/6614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gwik](https://github.com/gwik)) -- Add `ParquetObjectReader::with_runtime` [\#6612](https://github.com/apache/arrow-rs/pull/6612) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([itsjunetime](https://github.com/itsjunetime)) -- Re-enable `C#` arrow flight integration test [\#6611](https://github.com/apache/arrow-rs/pull/6611) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add Array::logical\_null\_count for inspecting number of null values [\#6608](https://github.com/apache/arrow-rs/pull/6608) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Added casting from Binary/LargeBinary to Utf8View [\#6592](https://github.com/apache/arrow-rs/pull/6592) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ngli-me](https://github.com/ngli-me)) -- Parquet AsyncReader: Don't panic when empty offset\_index is Some\(\[\]\) [\#6582](https://github.com/apache/arrow-rs/pull/6582) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jroddev](https://github.com/jroddev)) -- Skip writing down null buffers for non-nullable primitive arrays [\#6524](https://github.com/apache/arrow-rs/pull/6524) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([bkirwi](https://github.com/bkirwi)) +- doc: add comment for timezone string [\#6899](https://github.com/apache/arrow-rs/pull/6899) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xxchan](https://github.com/xxchan)) +- docs: fix typo [\#6890](https://github.com/apache/arrow-rs/pull/6890) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Minor: Fix deprecation notice for `arrow_to_parquet_schema` [\#6889](https://github.com/apache/arrow-rs/pull/6889) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Add Field::with\_dict\_is\_ordered [\#6885](https://github.com/apache/arrow-rs/pull/6885) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Deprecate "max statistics size" property in `WriterProperties` [\#6884](https://github.com/apache/arrow-rs/pull/6884) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Add deprecation warnings for everything related to `dict_id` [\#6873](https://github.com/apache/arrow-rs/pull/6873) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([brancz](https://github.com/brancz)) +- Enable matching temporal as from\_type to Utf8View [\#6872](https://github.com/apache/arrow-rs/pull/6872) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Kev1n8](https://github.com/Kev1n8)) +- Enable string-based column projections from Parquet files [\#6871](https://github.com/apache/arrow-rs/pull/6871) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Improvements to UTF-8 statistics truncation [\#6870](https://github.com/apache/arrow-rs/pull/6870) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
([etseidl](https://github.com/etseidl)) +- fix: make GetCatalogsBuilder sort catalog names [\#6864](https://github.com/apache/arrow-rs/pull/6864) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([niebayes](https://github.com/niebayes)) +- add buffered data\_pages to parquet column writer total bytes estimation [\#6862](https://github.com/apache/arrow-rs/pull/6862) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([onursatici](https://github.com/onursatici)) +- Update prost-build requirement from =0.13.3 to =0.13.4 [\#6860](https://github.com/apache/arrow-rs/pull/6860) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Minor: add comments explaining bad MSRV, output in json [\#6857](https://github.com/apache/arrow-rs/pull/6857) ([alamb](https://github.com/alamb)) +- perf: Use Cow in get\_format\_string in FFI\_ArrowSchema [\#6853](https://github.com/apache/arrow-rs/pull/6853) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) +- chore: add cast\_decimal benchmark [\#6850](https://github.com/apache/arrow-rs/pull/6850) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) +- arrow-array::builder: support Int8, Int16 and Int64 keys [\#6845](https://github.com/apache/arrow-rs/pull/6845) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ajwerner](https://github.com/ajwerner)) +- Add `ArrowToParquetSchemaConverter`, deprecate `arrow_to_parquet_schema` [\#6840](https://github.com/apache/arrow-rs/pull/6840) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Remove APIs deprecated in 50.0.0 [\#6838](https://github.com/apache/arrow-rs/pull/6838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- fix: decimal conversion looses value on lower precision [\#6836](https://github.com/apache/arrow-rs/pull/6836) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([himadripal](https://github.com/himadripal)) +- Update sysinfo requirement from 0.32.0 to 0.33.0 [\#6835](https://github.com/apache/arrow-rs/pull/6835) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Optionally coerce names of maps and lists to match Parquet specification [\#6828](https://github.com/apache/arrow-rs/pull/6828) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Remove deprecated unary\_dyn and try\_unary\_dyn [\#6824](https://github.com/apache/arrow-rs/pull/6824) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Remove deprecated flight\_data\_from\_arrow\_batch [\#6823](https://github.com/apache/arrow-rs/pull/6823) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- \[arrow-cast\] Support cast boolean from/to string view [\#6822](https://github.com/apache/arrow-rs/pull/6822) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- Hook up Avro Decoder [\#6820](https://github.com/apache/arrow-rs/pull/6820) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix arrow-avro compilation without default features [\#6819](https://github.com/apache/arrow-rs/pull/6819) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Support shrink to empty [\#6817](https://github.com/apache/arrow-rs/pull/6817) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- \[arrow-cast\] Support cast numeric to string view \(alternate\) [\#6816](https://github.com/apache/arrow-rs/pull/6816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Hide implicit optional dependency features in arrow-flight [\#6806](https://github.com/apache/arrow-rs/pull/6806) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- fix: Encoding of List offsets was incorrect when slice offsets begin with zero [\#6805](https://github.com/apache/arrow-rs/pull/6805) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HawaiianSpork](https://github.com/HawaiianSpork)) +- Enable unused\_crate\_dependencies Rust lint, remove unused dependencies [\#6804](https://github.com/apache/arrow-rs/pull/6804) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Minor: Fix docstrings for `ColumnProperties::statistics_enabled` property [\#6798](https://github.com/apache/arrow-rs/pull/6798) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Add option to disable writing of Parquet offset index [\#6797](https://github.com/apache/arrow-rs/pull/6797) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Remove unused dependencies [\#6792](https://github.com/apache/arrow-rs/pull/6792) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Add `Array::shrink_to_fit(&mut self)` [\#6790](https://github.com/apache/arrow-rs/pull/6790) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([emilk](https://github.com/emilk)) +- Formalize the default nested list field name to `item` [\#6785](https://github.com/apache/arrow-rs/pull/6785) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([gruuya](https://github.com/gruuya)) +- Improve UnionArray logical\_nulls tests [\#6781](https://github.com/apache/arrow-rs/pull/6781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gstvg](https://github.com/gstvg)) +- Improve list builder usage example in docs [\#6775](https://github.com/apache/arrow-rs/pull/6775) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Update proc-macro2 requirement from =1.0.89 to =1.0.92 [\#6772](https://github.com/apache/arrow-rs/pull/6772) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Allow NullBuffer construction directly from array 
[\#6769](https://github.com/apache/arrow-rs/pull/6769) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi))
+- Include license and notice files in published crates [\#6767](https://github.com/apache/arrow-rs/pull/6767) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([ankane](https://github.com/ankane))
+- fix: remove redundant `bit_util::ceil` [\#6766](https://github.com/apache/arrow-rs/pull/6766) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([miroim](https://github.com/miroim))
+- Remove 'make\_row', expose a 'Row::new' method instead. [\#6763](https://github.com/apache/arrow-rs/pull/6763) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jonded94](https://github.com/jonded94))
+- Read nested Parquet 2-level lists correctly [\#6757](https://github.com/apache/arrow-rs/pull/6757) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl))
+- Split `timestamp_s_to_datetime` to `date` and `time` to avoid unnecessary computation [\#6755](https://github.com/apache/arrow-rs/pull/6755) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211))
+- More trivial implementation of `Box<dyn AsyncFileReader>` and `Box<dyn FileReader>` [\#6748](https://github.com/apache/arrow-rs/pull/6748) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([ethe](https://github.com/ethe))
+- Update cache action to v4 [\#6744](https://github.com/apache/arrow-rs/pull/6744) ([findepi](https://github.com/findepi))
+- Remove redundant implementation of `StringArrayType` [\#6743](https://github.com/apache/arrow-rs/pull/6743) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365))
+- Fix Dictionary logical nulls for RunArray/UnionArray Values [\#6740](https://github.com/apache/arrow-rs/pull/6740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi))
+- Allow reading Parquet maps that lack a `values` field [\#6730](https://github.com/apache/arrow-rs/pull/6730) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl))
+- Improve default implementation of Array::is\_nullable [\#6721](https://github.com/apache/arrow-rs/pull/6721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi))
+- Fix Buffer::bit\_slice losing length with byte-aligned offsets [\#6707](https://github.com/apache/arrow-rs/pull/6707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime))

diff --git a/Cargo.toml b/Cargo.toml
index 375a4efac551..75ba410f12a6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -62,7 +62,7 @@ exclude = [
 ]

 [workspace.package]
-version = "53.3.0"
+version = "54.0.0"
 homepage = "https://github.com/apache/arrow-rs"
 repository = "https://github.com/apache/arrow-rs"
 authors = ["Apache Arrow "]
@@ -77,20 +77,20 @@ edition = "2021"
 rust-version = "1.62"

 [workspace.dependencies]
-arrow = { version = "53.3.0", path = "./arrow", default-features = false }
-arrow-arith = { version = "53.3.0", path = "./arrow-arith" }
-arrow-array = { version = "53.3.0", path = "./arrow-array" }
-arrow-buffer = { version =
"53.3.0", path = "./arrow-buffer" } -arrow-cast = { version = "53.3.0", path = "./arrow-cast" } -arrow-csv = { version = "53.3.0", path = "./arrow-csv" } -arrow-data = { version = "53.3.0", path = "./arrow-data" } -arrow-ipc = { version = "53.3.0", path = "./arrow-ipc" } -arrow-json = { version = "53.3.0", path = "./arrow-json" } -arrow-ord = { version = "53.3.0", path = "./arrow-ord" } -arrow-row = { version = "53.3.0", path = "./arrow-row" } -arrow-schema = { version = "53.3.0", path = "./arrow-schema" } -arrow-select = { version = "53.3.0", path = "./arrow-select" } -arrow-string = { version = "53.3.0", path = "./arrow-string" } -parquet = { version = "53.3.0", path = "./parquet", default-features = false } +arrow = { version = "54.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "54.0.0", path = "./arrow-arith" } +arrow-array = { version = "54.0.0", path = "./arrow-array" } +arrow-buffer = { version = "54.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "54.0.0", path = "./arrow-cast" } +arrow-csv = { version = "54.0.0", path = "./arrow-csv" } +arrow-data = { version = "54.0.0", path = "./arrow-data" } +arrow-ipc = { version = "54.0.0", path = "./arrow-ipc" } +arrow-json = { version = "54.0.0", path = "./arrow-json" } +arrow-ord = { version = "54.0.0", path = "./arrow-ord" } +arrow-row = { version = "54.0.0", path = "./arrow-row" } +arrow-schema = { version = "54.0.0", path = "./arrow-schema" } +arrow-select = { version = "54.0.0", path = "./arrow-select" } +arrow-string = { version = "54.0.0", path = "./arrow-string" } +parquet = { version = "54.0.0", path = "./parquet", default-features = false } chrono = { version = "0.4.34", default-features = false, features = ["clock"] } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 661abfc58691..3ffc8780c2f8 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -31,7 +31,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "53.3.0" +arrow-flight = "54.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index d00cc498625f..4a2f5e3f1987 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="53.2.0" -FUTURE_RELEASE="53.3.0" +SINCE_TAG="53.3.0" +FUTURE_RELEASE="54.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 5f01ed410f07c711caab20662fb1ca96314b0f66 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 19 Dec 2024 14:51:30 -0500 Subject: [PATCH 13/68] [object store] Add planned release schedule (#6904) --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 723249ad29e5..ed42f630514b 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,13 @@ versions approximately every 2 months. [`object_store`]: https://crates.io/crates/object_store +Planned Release Schedule + +| Approximate Date | Version | Notes | +| ---------------- | -------- | --------------------------------------- | +| Dec 2024 | `0.11.2` | Minor, NO breaking API changes | +| Feb 2025 | `0.12.0` | Major, potentially breaking API changes | + ### Deprecation Guidelines Minor releases may deprecate, but not remove APIs. 
Deprecating APIs allows

From f6eaca587c93afc2df262bc4b48c3dce13dc4793 Mon Sep 17 00:00:00 2001
From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com>
Date: Thu, 19 Dec 2024 21:52:18 +0200
Subject: [PATCH 14/68] add `extend_dictionary` in dictionary builder for
 improved performance (#6875)

* add `extend_dictionary` in dictionary builder for improved performance

* fix extends all nulls

* support null in mapped value

* adding comment

* run `clippy` and `fmt`

* fix ci

* Apply suggestions from code review

Co-authored-by: Andrew Lamb

---------

Co-authored-by: Andrew Lamb
---
 .../generic_bytes_dictionary_builder.rs       | 187 ++++++++++++++++-
 .../builder/primitive_dictionary_builder.rs   | 198 +++++++++++++++++-
 2 files changed, 379 insertions(+), 6 deletions(-)

diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
index bb0fb5e91be2..ead151d5ceea 100644
--- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
@@ -17,7 +17,7 @@ use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
 use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
-use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray};
+use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray, TypedDictionaryArray};
 use arrow_buffer::ArrowNativeType;
 use arrow_schema::{ArrowError, DataType};
 use hashbrown::HashTable;
@@ -305,6 +305,63 @@ where
         };
     }

+    /// Extends builder with an existing dictionary array.
+    ///
+    /// This is the same as [`Self::extend`] but is faster as it translates
+    /// the dictionary values once rather than doing a lookup for each item in the iterator
+    ///
+    /// when dictionary values are null (the actual mapped values) the keys are null
+    ///
+    pub fn extend_dictionary(
+        &mut self,
+        dictionary: &TypedDictionaryArray<K, &GenericByteArray<T>>,
+    ) -> Result<(), ArrowError> {
+        let values = dictionary.values();
+
+        let v_len = values.len();
+        let k_len = dictionary.keys().len();
+        if v_len == 0 && k_len == 0 {
+            return Ok(());
+        }
+
+        // All nulls
+        if v_len == 0 {
+            self.append_nulls(k_len);
+            return Ok(());
+        }
+
+        if k_len == 0 {
+            return Err(ArrowError::InvalidArgumentError(
+                "Dictionary keys should not be empty when values are not empty".to_string(),
+            ));
+        }
+
+        // Orphan values will be carried over to the new dictionary
+        let mapped_values = values
+            .iter()
+            // Dictionary values can technically be null, so we need to handle that
+            .map(|dict_value| {
+                dict_value
+                    .map(|dict_value| self.get_or_insert_key(dict_value))
+                    .transpose()
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        // Just insert the keys without additional lookups
+        dictionary.keys().iter().for_each(|key| match key {
+            None => self.append_null(),
+            Some(original_dict_index) => {
+                let index = original_dict_index.as_usize().min(v_len - 1);
+                match mapped_values[index] {
+                    None => self.append_null(),
+                    Some(mapped_value) => self.keys_builder.append_value(mapped_value),
+                }
+            }
+        });
+
+        Ok(())
+    }
+
     /// Builds the `DictionaryArray` and reset this builder.
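    /// # Example
    ///
    /// A minimal usage sketch (illustrative only, not part of the upstream commit;
    /// the `Int32Type` key type and string values are assumptions that mirror the
    /// tests further below):
    ///
    /// ```
    /// # use arrow_array::builder::StringDictionaryBuilder;
    /// # use arrow_array::types::Int32Type;
    /// let mut source = StringDictionaryBuilder::<Int32Type>::new();
    /// source.extend(["a", "b", "a"].into_iter().map(Some));
    /// let source = source.finish();
    ///
    /// let mut builder = StringDictionaryBuilder::<Int32Type>::new();
    /// // `extend_dictionary` translates the source values once, then copies the keys
    /// builder.extend_dictionary(&source.downcast_dict().unwrap()).unwrap();
    /// let dict = builder.finish();
    /// assert_eq!(dict.keys().values(), &[0, 1, 0]);
    /// ```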
pub fn finish(&mut self) -> DictionaryArray { self.dedup.clear(); @@ -445,8 +502,9 @@ mod tests { use super::*; use crate::array::Int8Array; + use crate::cast::AsArray; use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type}; - use crate::{BinaryArray, StringArray}; + use crate::{ArrowPrimitiveType, BinaryArray, StringArray}; fn test_bytes_dictionary_builder(values: Vec<&T::Native>) where @@ -664,4 +722,129 @@ mod tests { assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]); assert_eq!(dict.values().len(), 4); } + + #[test] + fn test_extend_dictionary() { + let some_dict = { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some)); + builder.extend([None::<&str>]); + builder.extend(["c", "d", "a"].into_iter().map(Some)); + builder.append_null(); + builder.finish() + }; + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some)); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 6); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [ + Some("e"), + Some("e"), + Some("f"), + Some("e"), + Some("d"), + Some("a"), + Some("b"), + Some("c"), + Some("a"), + Some("b"), + Some("c"), + None, + Some("c"), + Some("d"), + Some("a"), + None + ] + ); + } + #[test] + fn test_extend_dictionary_with_null_in_mapped_value() { + let some_dict = { + let mut values_builder = GenericByteBuilder::::new(); + let mut keys_builder = PrimitiveBuilder::::new(); + + // Manually build a dictionary values that the mapped values have null + values_builder.append_null(); + keys_builder.append_value(0); + values_builder.append_value("I like worm hugs"); + keys_builder.append_value(1); + + let values = values_builder.finish(); + let keys = keys_builder.finish(); + + let data_type = DataType::Dictionary( + Box::new(Int32Type::DATA_TYPE), + Box::new(Utf8Type::DATA_TYPE), + ); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + }; + + let some_dict_values = some_dict.values().as_string::(); + assert_eq!( + some_dict_values.into_iter().collect::>(), + &[None, Some("I like worm hugs")] + ); + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, Some("I like worm hugs")]); + } + + #[test] + fn test_extend_all_null_dictionary() { + let some_dict = { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.append_nulls(2); + builder.finish() + }; + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 0); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, None]); + } } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index ac40f8a469d3..282f0ae9d5b1 100644 --- 
a/arrow-array/src/builder/primitive_dictionary_builder.rs
+++ b/arrow-array/src/builder/primitive_dictionary_builder.rs
@@ -17,7 +17,9 @@ use crate::builder::{ArrayBuilder, PrimitiveBuilder};
 use crate::types::ArrowDictionaryKeyType;
-use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray};
+use crate::{
+    Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, PrimitiveArray, TypedDictionaryArray,
+};
 use arrow_buffer::{ArrowNativeType, ToByteSlice};
 use arrow_schema::{ArrowError, DataType};
 use std::any::Any;
@@ -44,7 +46,7 @@ impl PartialEq for Value {
 impl Eq for Value {}

-/// Builder for [`DictionaryArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray)
+/// Builder for [`DictionaryArray`] of [`PrimitiveArray`]
 ///
 /// # Example:
 ///
@@ -303,6 +305,63 @@ where
         };
     }

+    /// Extends builder with dictionary
+    ///
+    /// This is the same as [`Self::extend`] but is faster as it translates
+    /// the dictionary values once rather than doing a lookup for each item in the iterator
+    ///
+    /// when dictionary values are null (the actual mapped values) the keys are null
+    ///
+    pub fn extend_dictionary(
+        &mut self,
+        dictionary: &TypedDictionaryArray<K, &PrimitiveArray<V>>,
+    ) -> Result<(), ArrowError> {
+        let values = dictionary.values();
+
+        let v_len = values.len();
+        let k_len = dictionary.keys().len();
+        if v_len == 0 && k_len == 0 {
+            return Ok(());
+        }
+
+        // All nulls
+        if v_len == 0 {
+            self.append_nulls(k_len);
+            return Ok(());
+        }
+
+        if k_len == 0 {
+            return Err(ArrowError::InvalidArgumentError(
+                "Dictionary keys should not be empty when values are not empty".to_string(),
+            ));
+        }
+
+        // Orphan values will be carried over to the new dictionary
+        let mapped_values = values
+            .iter()
+            // Dictionary values can technically be null, so we need to handle that
+            .map(|dict_value| {
+                dict_value
+                    .map(|dict_value| self.get_or_insert_key(dict_value))
+                    .transpose()
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        // Just insert the keys without additional lookups
+        dictionary.keys().iter().for_each(|key| match key {
+            None => self.append_null(),
+            Some(original_dict_index) => {
+                let index = original_dict_index.as_usize().min(v_len - 1);
+                match mapped_values[index] {
+                    None => self.append_null(),
+                    Some(mapped_value) => self.keys_builder.append_value(mapped_value),
+                }
+            }
+        });
+
+        Ok(())
+    }
+
     /// Builds the `DictionaryArray` and reset this builder.
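    /// # Example
    ///
    /// A minimal usage sketch (illustrative only, not part of the upstream commit;
    /// the `Int32Type` key and value types are assumptions that mirror the tests
    /// further below):
    ///
    /// ```
    /// # use arrow_array::builder::PrimitiveDictionaryBuilder;
    /// # use arrow_array::types::Int32Type;
    /// let mut source = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
    /// source.extend([10, 20, 10].into_iter().map(Some));
    /// let source = source.finish();
    ///
    /// let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
    /// // `extend_dictionary` translates the source values once, then copies the keys
    /// builder.extend_dictionary(&source.downcast_dict().unwrap()).unwrap();
    /// let dict = builder.finish();
    /// assert_eq!(dict.keys().values(), &[0, 1, 0]);
    /// ```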
pub fn finish(&mut self) -> DictionaryArray { self.map.clear(); @@ -368,9 +427,9 @@ impl Extend> mod tests { use super::*; - use crate::array::UInt32Array; - use crate::array::UInt8Array; + use crate::array::{Int32Array, UInt32Array, UInt8Array}; use crate::builder::Decimal128Builder; + use crate::cast::AsArray; use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type}; #[test] @@ -443,4 +502,135 @@ mod tests { ) ); } + + #[test] + fn test_extend_dictionary() { + let some_dict = { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some)); + builder.extend([None::]); + builder.extend([4, 5, 1, 3, 1].into_iter().map(Some)); + builder.append_null(); + builder.finish() + }; + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend([6, 6, 7, 6, 5].into_iter().map(Some)); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 7); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [ + Some(6), + Some(6), + Some(7), + Some(6), + Some(5), + Some(1), + Some(2), + Some(3), + Some(1), + Some(2), + Some(3), + Some(1), + Some(2), + Some(3), + None, + Some(4), + Some(5), + Some(1), + Some(3), + Some(1), + None + ] + ); + } + + #[test] + fn test_extend_dictionary_with_null_in_mapped_value() { + let some_dict = { + let mut values_builder = PrimitiveBuilder::::new(); + let mut keys_builder = PrimitiveBuilder::::new(); + + // Manually build a dictionary values that the mapped values have null + values_builder.append_null(); + keys_builder.append_value(0); + values_builder.append_value(42); + keys_builder.append_value(1); + + let values = values_builder.finish(); + let keys = keys_builder.finish(); + + let data_type = DataType::Dictionary( + Box::new(Int32Type::DATA_TYPE), + Box::new(values.data_type().clone()), + ); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + }; + + let some_dict_values = some_dict.values().as_primitive::(); + assert_eq!( + some_dict_values.into_iter().collect::>(), + &[None, Some(42)] + ); + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, Some(42)]); + } + + #[test] + fn test_extend_all_null_dictionary() { + let some_dict = { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.append_nulls(2); + builder.finish() + }; + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 0); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, None]); + } } From 02377a0a1df41d5b25bb2d363ca86b185b148245 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 20 Dec 2024 16:18:55 -0500 Subject: [PATCH 15/68] [object_store]: Version and Changelog for 0.11.2 (#6908) * [object_store]: Version and Changelog for 0.11.2 * increment version * update script * changelog * tweaks * Update object_store/CHANGELOG.md Co-authored-by: 
Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/CHANGELOG-old.md | 39 ++++++++++++++ object_store/CHANGELOG.md | 51 ++++++++++--------- object_store/Cargo.toml | 2 +- object_store/dev/release/README.md | 5 +- object_store/dev/release/update_change_log.sh | 4 +- 5 files changed, 72 insertions(+), 29 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index 28dbde4e7b7f..c42689240dd9 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -19,6 +19,45 @@ # Historical Changelog + +## [object_store_0.11.1](https://github.com/apache/arrow-rs/tree/object_store_0.11.1) (2024-10-15) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.0...object_store_0.11.1) + +**Implemented enhancements:** + +- There is no way to pass object store client options as environment variables [\#6333](https://github.com/apache/arrow-rs/issues/6333) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Better Document Backoff Algorithm [\#6324](https://github.com/apache/arrow-rs/issues/6324) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add direction to `list_with_offset` [\#6274](https://github.com/apache/arrow-rs/issues/6274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support server-side encryption with customer-provided keys \(SSE-C\) [\#6229](https://github.com/apache/arrow-rs/issues/6229) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- \[object-store\] Requested tokio version is too old - does not compile [\#6458](https://github.com/apache/arrow-rs/issues/6458) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Azure SAS tokens are visible when retry errors are logged via object\_store [\#6322](https://github.com/apache/arrow-rs/issues/6322) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- object\_store: fix typo in with\_connect\_timeout\_disabled that actually disabled non-connect timeouts [\#6563](https://github.com/apache/arrow-rs/pull/6563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adriangb](https://github.com/adriangb)) +- object\_store: Clarify what is a prefix in list\(\) documentation [\#6520](https://github.com/apache/arrow-rs/pull/6520) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([progval](https://github.com/progval)) +- object\_store: enable lint `unreachable_pub` [\#6512](https://github.com/apache/arrow-rs/pull/6512) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- \[object\_store\] Retry S3 requests with 200 response with "Error" in body [\#6508](https://github.com/apache/arrow-rs/pull/6508) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([PeterKeDer](https://github.com/PeterKeDer)) +- \[object-store\] Require tokio 1.29.0. 
[\#6459](https://github.com/apache/arrow-rs/pull/6459) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ashtuchkin](https://github.com/ashtuchkin)) +- feat: expose HTTP/2 max frame size in `object_store` [\#6442](https://github.com/apache/arrow-rs/pull/6442) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Derive `Clone` for `object_store::aws::AmazonS3` [\#6414](https://github.com/apache/arrow-rs/pull/6414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ethe](https://github.com/ethe)) +- object\_score: Support Azure Fabric OAuth Provider [\#6382](https://github.com/apache/arrow-rs/pull/6382) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) +- `object_store::GetOptions` derive `Clone` [\#6361](https://github.com/apache/arrow-rs/pull/6361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([samuelcolvin](https://github.com/samuelcolvin)) +- \[object\_store\] Propagate env vars as object store client options [\#6334](https://github.com/apache/arrow-rs/pull/6334) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ccciudatu](https://github.com/ccciudatu)) +- docs\[object\_store\]: clarify the backoff strategy that is actually implemented [\#6325](https://github.com/apache/arrow-rs/pull/6325) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([westonpace](https://github.com/westonpace)) +- fix: azure sas token visible in logs [\#6323](https://github.com/apache/arrow-rs/pull/6323) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- object\_store/delimited: Fix `TrailingEscape` condition [\#6265](https://github.com/apache/arrow-rs/pull/6265) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- fix\(object\_store\): only add encryption headers for SSE-C in get request [\#6260](https://github.com/apache/arrow-rs/pull/6260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) +- docs: Add parquet\_opendal in related projects [\#6236](https://github.com/apache/arrow-rs/pull/6236) ([Xuanwo](https://github.com/Xuanwo)) +- feat\(object\_store\): add support for server-side encryption with customer-provided keys \(SSE-C\) [\#6230](https://github.com/apache/arrow-rs/pull/6230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) +- feat: further TLS options on ClientOptions: \#5034 [\#6148](https://github.com/apache/arrow-rs/pull/6148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) + + + ## [object_store_0.11.0](https://github.com/apache/arrow-rs/tree/object_store_0.11.0) (2024-08-12) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.2...object_store_0.11.0) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index 95585983572c..0e834c5e2ef2 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,41 +19,42 @@ # Changelog -## [object_store_0.11.1](https://github.com/apache/arrow-rs/tree/object_store_0.11.1) (2024-10-15) +## [object_store_0.11.2](https://github.com/apache/arrow-rs/tree/object_store_0.11.2) (2024-12-20) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.0...object_store_0.11.1)
+[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.1...object_store_0.11.2)
 
 **Implemented enhancements:**
 
-- There is no way to pass object store client options as environment variables [\#6333](https://github.com/apache/arrow-rs/issues/6333) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
-- Better Document Backoff Algorithm [\#6324](https://github.com/apache/arrow-rs/issues/6324) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
-- Add direction to `list_with_offset` [\#6274](https://github.com/apache/arrow-rs/issues/6274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
-- Support server-side encryption with customer-provided keys \(SSE-C\) [\#6229](https://github.com/apache/arrow-rs/issues/6229) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
+- object-store's AzureClient should protect against multiple streams performing put\_block in parallel for the same BLOB path [\#6868](https://github.com/apache/arrow-rs/issues/6868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
+- Support S3 Put IfMatch [\#6799](https://github.com/apache/arrow-rs/issues/6799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
+- object\_store Azure Government using OAuth [\#6759](https://github.com/apache/arrow-rs/issues/6759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
+- Support for AWS Requester Pays buckets [\#6716](https://github.com/apache/arrow-rs/issues/6716) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
+- \[object-store\]: Implement credential\_process support for S3 [\#6422](https://github.com/apache/arrow-rs/issues/6422) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
+- object\_store: Conditional put and rename\_if\_not\_exist on S3 [\#6285](https://github.com/apache/arrow-rs/issues/6285) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
 
 **Fixed bugs:**
 
-- \[object-store\] Requested tokio version is too old - does not compile [\#6458](https://github.com/apache/arrow-rs/issues/6458) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
-- Azure SAS tokens are visible when retry errors are logged via object\_store [\#6322](https://github.com/apache/arrow-rs/issues/6322) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
+- `object_store` errors when `reqwest` `gzip` feature is enabled [\#6842](https://github.com/apache/arrow-rs/issues/6842) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
+- Multi-part s3 uploads fail when using checksum [\#6793](https://github.com/apache/arrow-rs/issues/6793) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
+- `with_unsigned_payload` shouldn't generate payload hash [\#6697](https://github.com/apache/arrow-rs/issues/6697) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
+- \[Object\_store\] min\_ttl is too high for GKE tokens [\#6625](https://github.com/apache/arrow-rs/issues/6625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
+- object\_store `test_private_bucket` fails - store: "S3", source: BucketNotFound { bucket: "bloxbender" } [\#6600](https://github.com/apache/arrow-rs/issues/6600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
+- S3 endpoint and trailing slash result in weird/invalid requests [\#6580](https://github.com/apache/arrow-rs/issues/6580) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)]
 
 **Merged pull requests:**
 
-- object\_store: fix typo in with\_connect\_timeout\_disabled that actually disabled non-connect timeouts [\#6563](https://github.com/apache/arrow-rs/pull/6563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adriangb](https://github.com/adriangb))
-- object\_store: Clarify what is a prefix in list\(\) documentation [\#6520](https://github.com/apache/arrow-rs/pull/6520) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([progval](https://github.com/progval))
-- object\_store: enable lint `unreachable_pub` [\#6512](https://github.com/apache/arrow-rs/pull/6512) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker))
-- \[object\_store\] Retry S3 requests with 200 response with "Error" in body [\#6508](https://github.com/apache/arrow-rs/pull/6508) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([PeterKeDer](https://github.com/PeterKeDer))
-- \[object-store\] Require tokio 1.29.0. [\#6459](https://github.com/apache/arrow-rs/pull/6459) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ashtuchkin](https://github.com/ashtuchkin))
-- feat: expose HTTP/2 max frame size in `object_store` [\#6442](https://github.com/apache/arrow-rs/pull/6442) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum))
-- Derive `Clone` for `object_store::aws::AmazonS3` [\#6414](https://github.com/apache/arrow-rs/pull/6414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ethe](https://github.com/ethe))
-- object\_score: Support Azure Fabric OAuth Provider [\#6382](https://github.com/apache/arrow-rs/pull/6382) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666))
-- `object_store::GetOptions` derive `Clone` [\#6361](https://github.com/apache/arrow-rs/pull/6361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([samuelcolvin](https://github.com/samuelcolvin))
-- \[object\_store\] Propagate env vars as object store client options [\#6334](https://github.com/apache/arrow-rs/pull/6334) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ccciudatu](https://github.com/ccciudatu))
-- docs\[object\_store\]: clarify the backoff strategy that is actually implemented [\#6325](https://github.com/apache/arrow-rs/pull/6325) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([westonpace](https://github.com/westonpace))
-- fix: azure sas token visible in logs [\#6323](https://github.com/apache/arrow-rs/pull/6323) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel))
-- object\_store/delimited: Fix `TrailingEscape` condition [\#6265](https://github.com/apache/arrow-rs/pull/6265) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87))
-- fix\(object\_store\): only add encryption headers for SSE-C in get request [\#6260](https://github.com/apache/arrow-rs/pull/6260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb))
-- docs: Add parquet\_opendal in related projects [\#6236](https://github.com/apache/arrow-rs/pull/6236) ([Xuanwo](https://github.com/Xuanwo))
-- feat\(object\_store\): add support for server-side encryption with customer-provided keys \(SSE-C\) [\#6230](https://github.com/apache/arrow-rs/pull/6230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb))
-- feat: further TLS options on ClientOptions: \#5034 [\#6148](https://github.com/apache/arrow-rs/pull/6148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker))
+- Use randomized content ID for Azure multipart uploads [\#6869](https://github.com/apache/arrow-rs/pull/6869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avarnon](https://github.com/avarnon))
+- Always explicitly disable `gzip` automatic decompression on reqwest client used by object\_store [\#6843](https://github.com/apache/arrow-rs/pull/6843) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([phillipleblanc](https://github.com/phillipleblanc))
+- object-store: remove S3ConditionalPut::ETagPutIfNotExists [\#6802](https://github.com/apache/arrow-rs/pull/6802) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch))
+- Fix multipart uploads with checksums on object locked buckets [\#6794](https://github.com/apache/arrow-rs/pull/6794) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio))
+- Add AuthorityHost to AzureConfigKey [\#6773](https://github.com/apache/arrow-rs/pull/6773) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([zadeluca](https://github.com/zadeluca))
+- object\_store: Add support for requester pays buckets [\#6768](https://github.com/apache/arrow-rs/pull/6768) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kylebarron](https://github.com/kylebarron))
+- check sign\_payload instead of skip\_signature before computing checksum [\#6698](https://github.com/apache/arrow-rs/pull/6698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mherrerarendon](https://github.com/mherrerarendon))
+- Update quick-xml requirement from 0.36.0 to 0.37.0 in /object\_store [\#6687](https://github.com/apache/arrow-rs/pull/6687) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum))
+- Support native S3 conditional writes [\#6682](https://github.com/apache/arrow-rs/pull/6682) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch))
+- \[object\_store\] fix S3 endpoint and trailing slash result in invalid requests [\#6641](https://github.com/apache/arrow-rs/pull/6641) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adbmal](https://github.com/adbmal))
+- Lower GCP token min\_ttl to 4 minutes and add backoff to token refresh logic [\#6638](https://github.com/apache/arrow-rs/pull/6638) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mwylde](https://github.com/mwylde))
+- Remove `test_private_bucket` object\_store test [\#6601](https://github.com/apache/arrow-rs/pull/6601) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb))

diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml
index bcc8e0b92243..bf254b3a0bbd 100644
--- a/object_store/Cargo.toml
+++ b/object_store/Cargo.toml
@@ -17,7 +17,7 @@
 [package]
 name = "object_store"
-version = "0.11.1"
+version = "0.11.2"
 edition = "2021"
 license = "MIT/Apache-2.0"
 readme = "README.md"

diff --git a/object_store/dev/release/README.md b/object_store/dev/release/README.md
index 912ff4cd8bac..2dd1f6243c09 100644
--- a/object_store/dev/release/README.md
+++ b/object_store/dev/release/README.md
@@ -24,7 +24,10 @@
 
 This file documents the release process for the `object_store` crate.
 
-At the time of writing, we release a new version of `object_store` on demand rather than on a regular schedule.
+We release a new version of `object_store` according to the schedule listed in
+the [main README.md]
+
+[main README.md]: https://github.com/apache/arrow-rs?tab=readme-ov-file#object_store-crate
 
 As we are still in an early phase, we use the 0.x version scheme. If any code has
 been merged to main that has a breaking API change, as defined in [Rust RFC 1105]

diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh
index 30724478ae1e..2797b62c0010 100755
--- a/object_store/dev/release/update_change_log.sh
+++ b/object_store/dev/release/update_change_log.sh
@@ -29,8 +29,8 @@
 
 set -e
 
-SINCE_TAG="object_store_0.11.0"
-FUTURE_RELEASE="object_store_0.11.1"
+SINCE_TAG="object_store_0.11.1"
+FUTURE_RELEASE="object_store_0.11.2"
 
 SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)"

From d9885da7c61e83cf23f8b9e66fab40391e49fa03 Mon Sep 17 00:00:00 2001
From: Xuanwo
Date: Tue, 24 Dec 2024 22:22:33 +0800
Subject: [PATCH 16/68] feat(parquet): Add next_row_group API for
 ParquetRecordBatchStream (#6907)

* feat(parquet): Add next_row_group API for ParquetRecordBatchStream

Signed-off-by: Xuanwo

* chore: Returning error instead of using unreachable

Signed-off-by: Xuanwo

---------

Signed-off-by: Xuanwo
---
 parquet/src/arrow/async_reader/mod.rs | 132 ++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)

diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs
index c408456df147..96715e1164b2 100644
--- a/parquet/src/arrow/async_reader/mod.rs
+++ b/parquet/src/arrow/async_reader/mod.rs
@@ -613,6 +613,9 @@ impl std::fmt::Debug for StreamState {
 
 /// An asynchronous [`Stream`](https://docs.rs/futures/latest/futures/stream/trait.Stream.html) of [`RecordBatch`]
 /// for a parquet file that can be constructed using [`ParquetRecordBatchStreamBuilder`].
+///
+/// `ParquetRecordBatchStream` also provides [`ParquetRecordBatchStream::next_row_group`] for fetching row groups,
+/// allowing users to decode record batches separately from I/O.
 pub struct ParquetRecordBatchStream<T> {
     metadata: Arc<ParquetMetaData>,
 
@@ -654,6 +657,70 @@ impl<T> ParquetRecordBatchStream<T> {
     }
 }
 
+impl<T> ParquetRecordBatchStream<T>
+where
+    T: AsyncFileReader + Unpin + Send + 'static,
+{
+    /// Fetches the next row group from the stream.
+    ///
+    /// Users can continue to call this function to get row groups and decode them concurrently.
+    ///
+    /// ## Notes
+    ///
+    /// ParquetRecordBatchStream should be used either as a `Stream` or with `next_row_group`; they should not be used simultaneously.
+    ///
+    /// ## Returns
+    ///
+    /// - `Ok(None)` if the stream has ended.
+    /// - `Err(error)` if the stream has errored. All subsequent calls will return `Ok(None)`.
+    /// - `Ok(Some(reader))` which holds all the data for the row group.
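+    ///
+    /// ## Example
+    ///
+    /// A minimal sketch of driving the stream one row group at a time; `builder`
+    /// is assumed to be an already-configured [`ParquetRecordBatchStreamBuilder`]:
+    ///
+    /// ```ignore
+    /// let mut stream = builder.build()?;
+    /// while let Some(reader) = stream.next_row_group().await? {
+    ///     // `reader` is a synchronous `ParquetRecordBatchReader`, so decoding
+    ///     // can proceed on another task while the next row group is fetched
+    ///     for batch in reader {
+    ///         println!("read {} rows", batch?.num_rows());
+    ///     }
+    /// }
+    /// ```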
+    pub async fn next_row_group(&mut self) -> Result<Option<ParquetRecordBatchReader>> {
+        loop {
+            match &mut self.state {
+                StreamState::Decoding(_) | StreamState::Reading(_) => {
+                    return Err(ParquetError::General(
+                        "Cannot combine the use of next_row_group with the Stream API".to_string(),
+                    ))
+                }
+                StreamState::Init => {
+                    let row_group_idx = match self.row_groups.pop_front() {
+                        Some(idx) => idx,
+                        None => return Ok(None),
+                    };
+
+                    let row_count = self.metadata.row_group(row_group_idx).num_rows() as usize;
+
+                    let selection = self.selection.as_mut().map(|s| s.split_off(row_count));
+
+                    let reader_factory = self.reader.take().expect("lost reader");
+
+                    let (reader_factory, maybe_reader) = reader_factory
+                        .read_row_group(
+                            row_group_idx,
+                            selection,
+                            self.projection.clone(),
+                            self.batch_size,
+                        )
+                        .await
+                        .map_err(|err| {
+                            self.state = StreamState::Error;
+                            err
+                        })?;
+                    self.reader = Some(reader_factory);
+
+                    if let Some(reader) = maybe_reader {
+                        return Ok(Some(reader));
+                    } else {
+                        // All rows skipped, read next row group
+                        continue;
+                    }
+                }
+                StreamState::Error => return Ok(None), // Ends the stream as error happens.
+            }
+        }
+    }
+}
+
 impl<T> Stream for ParquetRecordBatchStream<T>
 where
     T: AsyncFileReader + Unpin + Send + 'static,
@@ -1020,6 +1087,71 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn test_async_reader_with_next_row_group() {
+        let testdata = arrow::util::test_util::parquet_test_data();
+        let path = format!("{testdata}/alltypes_plain.parquet");
+        let data = Bytes::from(std::fs::read(path).unwrap());
+
+        let metadata = ParquetMetaDataReader::new()
+            .parse_and_finish(&data)
+            .unwrap();
+        let metadata = Arc::new(metadata);
+
+        assert_eq!(metadata.num_row_groups(), 1);
+
+        let async_reader = TestReader {
+            data: data.clone(),
+            metadata: metadata.clone(),
+            requests: Default::default(),
+        };
+
+        let requests = async_reader.requests.clone();
+        let builder = ParquetRecordBatchStreamBuilder::new(async_reader)
+            .await
+            .unwrap();
+
+        let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![1, 2]);
+        let mut stream = builder
+            .with_projection(mask.clone())
+            .with_batch_size(1024)
+            .build()
+            .unwrap();
+
+        let mut readers = vec![];
+        while let Some(reader) = stream.next_row_group().await.unwrap() {
+            readers.push(reader);
+        }
+
+        let async_batches: Vec<_> = readers
+            .into_iter()
+            .flat_map(|r| r.map(|v| v.unwrap()).collect::<Vec<_>>())
+            .collect();
+
+        let sync_batches = ParquetRecordBatchReaderBuilder::try_new(data)
+            .unwrap()
+            .with_projection(mask)
+            .with_batch_size(104)
+            .build()
+            .unwrap()
+            .collect::<Result<Vec<_>>>()
+            .unwrap();
+
+        assert_eq!(async_batches, sync_batches);
+
+        let requests = requests.lock().unwrap();
+        let (offset_1, length_1) = metadata.row_group(0).column(1).byte_range();
+        let (offset_2, length_2) = metadata.row_group(0).column(2).byte_range();
+
+        assert_eq!(
+            &requests[..],
+            &[
+                offset_1 as usize..(offset_1 + length_1) as usize,
+                offset_2 as usize..(offset_2 + length_2) as usize
+            ]
+        );
+    }
+
     #[tokio::test]
     async fn test_async_reader_with_index() {
         let testdata = arrow::util::test_util::parquet_test_data();

From 7ef432b277ef09a94e6a6898aeef8c402d863231 Mon Sep 17 00:00:00 2001
From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com>
Date: Thu, 26 Dec 2024 16:49:36 +0200
Subject: [PATCH 17/68] chore(arrow-ord): move `can_rank` to the `rank` file
 (#6910)

---
 arrow-ord/src/rank.rs |  9 +++++++++
 arrow-ord/src/sort.rs | 11 +----------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arrow-ord/src/rank.rs b/arrow-ord/src/rank.rs
index ecc693bab4e4..e61cebef38ec 100644
--- a/arrow-ord/src/rank.rs
+++ b/arrow-ord/src/rank.rs
@@ -24,6 +24,15 @@ use arrow_buffer::NullBuffer;
 use arrow_schema::{ArrowError, DataType, SortOptions};
 use std::cmp::Ordering;
 
+/// Whether `arrow_ord::rank` can rank an array of given data type.
+pub(crate) fn can_rank(data_type: &DataType) -> bool {
+    data_type.is_primitive()
+        || matches!(
+            data_type,
+            DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary
+        )
+}
+
 /// Assigns a rank to each value in `array` based on its position in the sorted order
 ///
 /// Where values are equal, they will be assigned the highest of their ranks,

diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs
index 60fc4a918525..51a6659e631b 100644
--- a/arrow-ord/src/sort.rs
+++ b/arrow-ord/src/sort.rs
@@ -30,7 +30,7 @@ use arrow_select::take::take;
 use std::cmp::Ordering;
 use std::sync::Arc;
 
-use crate::rank::rank;
+use crate::rank::{can_rank, rank};
 pub use arrow_schema::SortOptions;
 
 /// Sort the `ArrayRef` using `SortOptions`.
@@ -190,15 +190,6 @@ fn partition_validity(array: &dyn Array) -> (Vec<u32>, Vec<u32>) {
     }
 }
 
-/// Whether `arrow_ord::rank` can rank an array of given data type.
-fn can_rank(data_type: &DataType) -> bool {
-    data_type.is_primitive()
-        || matches!(
-            data_type,
-            DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary
-        )
-}
-
 /// Whether `sort_to_indices` can sort an array of given data type.
 fn can_sort_to_indices(data_type: &DataType) -> bool {
     data_type.is_primitive()

From df87b132b22ac8d0a8352aeaf8298414705220da Mon Sep 17 00:00:00 2001
From: Curt Hagenlocher
Date: Sat, 19 Oct 2024 10:15:04 -0700
Subject: [PATCH 18/68] preliminary changes

---
 arrow-cast/src/cast/mod.rs | 4 ++--
 arrow-data/src/data.rs     | 4 ++++
 arrow-schema/src/ffi.rs    | 8 ++------
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 391ffce90cbe..d7edba261e6c 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -181,9 +181,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
         UInt8 | UInt16 | UInt32 | UInt64) |
         // decimal to signed numeric
         (Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _),
-        Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) |
+        Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) => true,
         // decimal to string
-        (Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _), Utf8View | Utf8 | LargeUtf8) |
+        (Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _), Utf8View | Utf8 | LargeUtf8) => true,
         // string to decimal
         (Utf8View | Utf8 | LargeUtf8, Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _)) => true,
         (Struct(from_fields), Struct(to_fields)) => {

diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 6f016d213675..7c5b9ea52ed4 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -143,6 +143,10 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
         DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
             [empty_buffer, MutableBuffer::new(0)]
         }
+        DataType::Decimal32(_, _) | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [
+            MutableBuffer::new(capacity * mem::size_of::<i128>()),
+            empty_buffer,
+        ],
         DataType::Union(_, mode) => {
             let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
             match mode {

diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs
index e99ea8d67899..56bc93559ed8 100644
--- a/arrow-schema/src/ffi.rs
+++ b/arrow-schema/src/ffi.rs
@@ -709,12 +709,8 @@ fn get_format_string(dtype: &DataType) -> Result<Cow<'static, str>, ArrowError>
     DataType::LargeUtf8 => Ok("U".into()),
     DataType::FixedSizeBinary(num_bytes) => Ok(Cow::Owned(format!("w:{num_bytes}"))),
     DataType::FixedSizeList(_, num_elems) => Ok(Cow::Owned(format!("+w:{num_elems}"))),
-    DataType::Decimal32(precision, scale) => {
-        Ok(Cow::Owned(format!("d:{precision},{scale},32")))
-    }
-    DataType::Decimal64(precision, scale) => {
-        Ok(Cow::Owned(format!("d:{precision},{scale},64")))
-    }
+    DataType::Decimal32(precision, scale) => Ok(Cow::Owned(format!("d:{precision},{scale},32"))),
+    DataType::Decimal64(precision, scale) => Ok(Cow::Owned(format!("d:{precision},{scale},64"))),
     DataType::Decimal128(precision, scale) => Ok(Cow::Owned(format!("d:{precision},{scale}"))),
     DataType::Decimal256(precision, scale) => {
         Ok(Cow::Owned(format!("d:{precision},{scale},256")))

From 78c899fe2f8b30abc96067f857fc7ba5f7e709fd Mon Sep 17 00:00:00 2001
From: Curt Hagenlocher
Date: Sun, 12 Jan 2025 19:09:56 -0800
Subject: [PATCH 19/68] Decimal32/64 mostly done

---
 arrow-array/src/cast.rs                        |  12 +
 arrow-cast/src/cast/mod.rs                     | 117 +++++--
 arrow-data/src/data.rs                         |   4 -
 arrow-json/src/writer/mod.rs                   |  48 +++
 arrow-ord/src/comparison.rs                    | 208 ++++++++++++
 arrow-ord/src/ord.rs                           |  28 +-
 arrow-ord/src/sort.rs                          | 319 ++++++------
 arrow-row/src/lib.rs                           |  60 ++++
 arrow/tests/array_cast.rs                      |  48 ++-
 .../src/arrow/array_reader/primitive_array.rs  |  26 +-
 parquet/src/arrow/arrow_reader/mod.rs          |  79 ++++-
 parquet/src/arrow/schema/mod.rs                |   2 +
 parquet/src/arrow/schema/primitive.rs          |   4 +-
 parquet/tests/arrow_reader/mod.rs              |  85 ++++-
 parquet/tests/arrow_reader/statistics.rs       |  86 ++++-
 15 files changed, 840 insertions(+), 286 deletions(-)

diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs
index a06ca34a02e7..9947c36d4619 100644
--- a/arrow-array/src/cast.rs
+++ b/arrow-array/src/cast.rs
@@ -1014,6 +1014,18 @@ mod tests {
         assert!(!as_string_array(&array).is_empty())
     }
 
+    #[test]
+    fn test_decimal32array() {
+        let a = Decimal32Array::from_iter_values([1, 2, 4, 5]);
+        assert!(!as_primitive_array::<Decimal32Type>(&a).is_empty());
+    }
+
+    #[test]
+    fn test_decimal64array() {
+        let a = Decimal64Array::from_iter_values([1, 2, 4, 5]);
+        assert!(!as_primitive_array::<Decimal64Type>(&a).is_empty());
+    }
+
     #[test]
     fn test_decimal128array() {
         let a = Decimal128Array::from_iter_values([1, 2, 4, 5]);

diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index d7edba261e6c..483680b1d39d 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -830,6 +830,7 @@ pub fn cast_with_options(
         (Map(_, ordered1), Map(_, ordered2)) if ordered1 == ordered2 => {
             cast_map_values(array.as_map(), to_type, cast_options, ordered1.to_owned())
         }
+        // Decimal to decimal, same width
         (Decimal32(p1, s1), Decimal32(p2, s2)) => {
             cast_decimal_to_decimal_same_type::<Decimal32Type>(
                 array.as_primitive(),
                 *p1,
                 *s1,
                 *p2,
                 *s2,
                 cast_options,
             )
         }
+        (Decimal64(p1, s1), Decimal64(p2, s2)) => {
+            cast_decimal_to_decimal_same_type::<Decimal64Type>(
+                array.as_primitive(),
+                *p1,
+                *s1,
+                *p2,
+                *s2,
+                cast_options,
+            )
+        }
+        (Decimal128(p1, s1), Decimal128(p2, s2)) => {
+            cast_decimal_to_decimal_same_type::<Decimal128Type>(
+                array.as_primitive(),
+                *p1,
+                *s1,
+                *p2,
+                *s2,
+                cast_options,
+            )
+        }
+        (Decimal256(p1, s1), Decimal256(p2, s2)) => {
+            cast_decimal_to_decimal_same_type::<Decimal256Type>(
+                array.as_primitive(),
+                *p1,
+                *s1,
+                *p2,
+                *s2,
+                cast_options,
+            )
+        }
+        // Decimal to decimal, different width
         (Decimal32(_, s1), Decimal64(p2, s2)) => {
             cast_decimal_to_decimal::<Decimal32Type, Decimal64Type>(
                 array.as_primitive(),
                 *s1,
                 *p2,
                 *s2,
                 cast_options,
             )
         }
         (Decimal32(_, s1), Decimal128(p2, s2)) => {
             cast_decimal_to_decimal::<Decimal32Type, Decimal128Type>(
                 array.as_primitive(),
                 *s1,
                 *p2,
                 *s2,
                 cast_options,
             )
         }
-        (Decimal64(p1, s1), Decimal64(p2, s2)) => {
-            cast_decimal_to_decimal_same_type::<Decimal64Type>(
+        (Decimal64(_, s1), Decimal32(p2, s2)) => {
+            cast_decimal_to_decimal::<Decimal64Type, Decimal32Type>(
                 array.as_primitive(),
-                *p1,
                 *s1,
                 *p2,
                 *s2,
                 cast_options,
             )
         }
         (Decimal64(_, s1), Decimal128(p2, s2)) => {
             cast_decimal_to_decimal::<Decimal64Type, Decimal128Type>(
                 array.as_primitive(),
                 *s1,
                 *p2,
                 *s2,
                 cast_options,
             )
         }
         (Decimal64(_, s1), Decimal256(p2, s2)) => {
             cast_decimal_to_decimal::<Decimal64Type, Decimal256Type>(
                 array.as_primitive(),
                 *s1,
                 *p2,
                 *s2,
                 cast_options,
             )
         }
-        (Decimal128(p1, s1), Decimal128(p2, s2)) => {
-            cast_decimal_to_decimal_same_type::<Decimal128Type>(
+        (Decimal128(_, s1), Decimal32(p2, s2)) => {
+            cast_decimal_to_decimal::<Decimal128Type, Decimal32Type>(
                 array.as_primitive(),
-                *p1,
                 *s1,
                 *p2,
                 *s2,
                 cast_options,
             )
         }
-        (Decimal256(p1, s1), Decimal256(p2, s2)) => {
-            cast_decimal_to_decimal_same_type::<Decimal256Type>(
+        (Decimal128(_, s1), Decimal64(p2, s2)) => {
+            cast_decimal_to_decimal::<Decimal128Type, Decimal64Type>(
                 array.as_primitive(),
-                *p1,
                 *s1,
                 *p2,
                 *s2,
                 cast_options,
             )
         }
         (Decimal128(_, s1), Decimal256(p2, s2)) => {
             cast_decimal_to_decimal::<Decimal128Type, Decimal256Type>(
                 array.as_primitive(),
                 *s1,
                 *p2,
                 *s2,
                 cast_options,
             )
         }
+        (Decimal256(_, s1), Decimal32(p2, s2)) => {
+            cast_decimal_to_decimal::<Decimal256Type, Decimal32Type>(
+                array.as_primitive(),
+                *s1,
+                *p2,
+                *s2,
+                cast_options,
+            )
+        }
+        (Decimal256(_, s1), Decimal64(p2, s2)) => {
+            cast_decimal_to_decimal::<Decimal256Type, Decimal64Type>(
+                array.as_primitive(),
+                *s1,
+                *p2,
+                *s2,
+                cast_options,
+            )
+        }
         (Decimal256(_, s1), Decimal128(p2, s2)) => {
             cast_decimal_to_decimal::<Decimal256Type, Decimal128Type>(
                 array.as_primitive(),
                 *s1,
                 *p2,
                 *s2,
                 cast_options,
             )
         }
+        // Decimal to non-decimal
         (Decimal32(_, scale), _) if !to_type.is_temporal() => {
             cast_from_decimal::<Decimal32Type, _>(
                 array,
                 scale,
                 from_type,
                 to_type,
                 cast_options,
             )
         }
+        (Decimal64(_, scale), _) if !to_type.is_temporal() => {
+            cast_from_decimal::<Decimal64Type, _>(
+                array,
+                scale,
+                from_type,
+                to_type,
+                cast_options,
+            )
+        }
         (Decimal128(_, scale), _) if !to_type.is_temporal() => {
             cast_from_decimal::<Decimal128Type, _>(
                 array,
                 scale,
                 from_type,
                 to_type,
                 cast_options,
             )
         }
         (Decimal256(_, scale), _) if !to_type.is_temporal() => {
             cast_from_decimal::<Decimal256Type, _>(
                 array,
                 scale,
                 from_type,
                 to_type,
                 cast_options,
             )
         }
+        // Non-decimal to decimal
         (_, Decimal32(precision, scale)) if !from_type.is_temporal() => {
             cast_to_decimal::<Decimal32Type, _>(
                 array,
                 precision,
                 scale,
                 from_type,
                 to_type,
                 cast_options,
             )
         }
@@ -2584,7 +2633,7 @@ mod tests {
             .with_precision_and_scale(precision, scale)
     }
 
-    fn create_decimal_array(
+    fn create_decimal128_array(
         array: Vec<Option<i128>>,
         precision: u8,
         scale: i8,
@@ -2653,7 +2702,7 @@ mod tests {
             Some(-3123456),
             None,
         ];
-        let array = create_decimal_array(array, 20, 4).unwrap();
+        let array = create_decimal128_array(array, 20, 4).unwrap();
         // decimal128 to decimal128
         let input_type = DataType::Decimal128(20, 4);
         let output_type = DataType::Decimal128(20, 3);
@@ -2804,7 +2853,7 @@ mod tests {
         let output_type = DataType::Decimal128(20, 4);
         assert!(can_cast_types(&input_type, &output_type));
         let array = vec![Some(1123456), Some(2123456), Some(3123456), None];
-        let array = create_decimal_array(array, 20, 3).unwrap();
+        let array = create_decimal128_array(array, 20, 3).unwrap();
         generate_cast_test_case!(
             &array,
             Decimal128Array,
@@ -2818,7 +2867,7 @@ mod tests {
         );
         // negative test
         let array = vec![Some(123456), None];
-        let array = create_decimal_array(array, 10, 0).unwrap();
+        let array = create_decimal128_array(array, 10, 0).unwrap();
         let result_safe = cast(&array, &DataType::Decimal128(2, 2));
         assert!(result_safe.is_ok());
         let options = CastOptions {
@@ -2874,7 +2923,7 @@ mod tests {
        );
        assert!(can_cast_types(&input_type, &output_type));
        let array = vec![Some(1123456), Some(2123456), Some(3123456), None];
-       let array = create_decimal_array(array, p, s).unwrap();
+       let array = create_decimal128_array(array, p, s).unwrap();
        let cast_array = cast_with_options(&array, &output_type, &CastOptions::default()).unwrap();
        assert_eq!(cast_array.data_type(), &output_type);
    }
@@ -2890,7 +2939,7 @@ mod tests {
        );
        assert!(can_cast_types(&input_type, &output_type));
        let array = vec![Some(1123456), Some(2123456), Some(3123456), None];
-       let array = create_decimal_array(array, p, s).unwrap();
+       let array = create_decimal128_array(array, p, s).unwrap();
        let cast_array = cast_with_options(&array, &output_type, &CastOptions::default()).unwrap();
        assert_eq!(cast_array.data_type(), &output_type);
    }
@@ -2942,7 +2991,7 @@ mod tests {
        assert!(can_cast_types(&input_type, &output_type));
 
        let array = vec![Some(i128::MAX)];
-       let array = create_decimal_array(array, 38, 3).unwrap();
+       let array = create_decimal128_array(array, 38, 3).unwrap();
        let result = cast_with_options(
            &array,
            &output_type,
@@ -2962,7 +3011,7 @@ mod tests {
        assert!(can_cast_types(&input_type, &output_type));
 
        let array = vec![Some(i128::MAX)];
-       let array = create_decimal_array(array, 38, 3).unwrap();
+       let array = create_decimal128_array(array, 38, 3).unwrap();
        let result = cast_with_options(
            &array,
            &output_type,
@@ -3019,7 +3068,7 @@ mod tests {
        let output_type = DataType::Decimal256(20, 4);
        assert!(can_cast_types(&input_type, &output_type));
        let array = vec![Some(1123456), Some(2123456), Some(3123456), None];
-       let array = create_decimal_array(array, 20, 3).unwrap();
+       let array = create_decimal128_array(array, 20, 3).unwrap();
        generate_cast_test_case!(
            &array,
            Decimal256Array,
@@ -3227,13 +3276,13 @@ mod tests {
    #[test]
    fn test_cast_decimal_to_numeric() {
        let value_array: Vec<Option<i128>> = vec![Some(125), Some(225), Some(325), None, Some(525)];
-       let array = create_decimal_array(value_array, 38, 2).unwrap();
+       let array = create_decimal128_array(value_array, 38, 2).unwrap();
 
        generate_decimal_to_numeric_cast_test_case!(&array);
 
        // overflow test: out of range of max u8
        let value_array: Vec<Option<i128>> = vec![Some(51300)];
-       let array = create_decimal_array(value_array, 38, 2).unwrap();
+       let array = create_decimal128_array(value_array, 38, 2).unwrap();
        let casted_array = cast_with_options(
            &array,
            &DataType::UInt8,
@@ -3260,7 +3309,7 @@ mod tests {
 
        // overflow test: out of range of max i8
        let value_array: Vec<Option<i128>> = vec![Some(24400)];
-       let array = create_decimal_array(value_array, 38, 2).unwrap();
+       let array = create_decimal128_array(value_array, 38, 2).unwrap();
        let casted_array = cast_with_options(
            &array,
            &DataType::Int8,
@@ -3297,7 +3346,7 @@ mod tests {
            Some(112345678),
            Some(112345679),
        ];
-       let array = create_decimal_array(value_array, 38, 2).unwrap();
+       let array = create_decimal128_array(value_array, 38, 2).unwrap();
        generate_cast_test_case!(
            &array,
            Float32Array,
@@ -3324,7 +3373,7 @@ mod tests {
            Some(112345678901234568),
            Some(112345678901234560),
        ];
-       let array = create_decimal_array(value_array, 38, 2).unwrap();
+       let array = create_decimal128_array(value_array, 38, 2).unwrap();
        generate_cast_test_case!(
            &array,
            Float64Array,
@@ -8638,7 +8687,7 @@ mod tests {
        let output_type = DataType::Decimal128(20, -1);
        assert!(can_cast_types(&input_type, &output_type));
        let array = vec![Some(1123450), Some(2123455), Some(3123456), None];
-       let input_decimal_array = create_decimal_array(array, 20, 0).unwrap();
+       let input_decimal_array = create_decimal128_array(array, 20, 0).unwrap();
        let array = Arc::new(input_decimal_array) as ArrayRef;
        generate_cast_test_case!(
            &array,
@@ -8696,7 +8745,7 @@ mod tests {
        let output_type = DataType::Decimal128(10, -2);
        assert!(can_cast_types(&input_type, &output_type));
        let array = vec![Some(123)];
-       let input_decimal_array = create_decimal_array(array, 10, -1).unwrap();
+       let input_decimal_array = create_decimal128_array(array, 10, -1).unwrap();
        let array = Arc::new(input_decimal_array) as ArrayRef;
        generate_cast_test_case!(&array, Decimal128Array, &output_type, vec![Some(12_i128),]);
 
        let decimal_arr = as_primitive_array::<Decimal128Type>(&array);
        assert_eq!("1200", decimal_arr.value_as_string(0));
 
        let array = vec![Some(125)];
-       let input_decimal_array = create_decimal_array(array, 10, -1).unwrap();
+       let input_decimal_array = create_decimal128_array(array, 10, -1).unwrap();
        let array = Arc::new(input_decimal_array) as ArrayRef;
        generate_cast_test_case!(&array, Decimal128Array, &output_type, vec![Some(13_i128),]);
@@ -8722,7 +8771,7 @@ mod tests {
        let output_type = DataType::Decimal256(10, 5);
        assert!(can_cast_types(&input_type, &output_type));
        let array = vec![Some(123456), Some(-123456)];
-       let input_decimal_array = create_decimal_array(array, 10, 3).unwrap();
+       let input_decimal_array = create_decimal128_array(array, 10, 3).unwrap();
        let array = Arc::new(input_decimal_array) as ArrayRef;
 
        let hundred = i256::from_i128(100);
@@ -9586,15 +9635,15 @@ mod tests {
 
        test_decimal_to_string::<Decimal128Type>(
            DataType::Utf8View,
-           create_decimal_array(array128.clone(), 7, 3).unwrap(),
+           create_decimal128_array(array128.clone(), 7, 3).unwrap(),
        );
        test_decimal_to_string::<Decimal128Type>(
            DataType::Utf8,
-           create_decimal_array(array128.clone(), 7, 3).unwrap(),
+           create_decimal128_array(array128.clone(), 7, 3).unwrap(),
        );
        test_decimal_to_string::<Decimal128Type>(
            DataType::LargeUtf8,
-           create_decimal_array(array128, 7, 3).unwrap(),
+           create_decimal128_array(array128, 7, 3).unwrap(),
        );
 
        test_decimal_to_string::<Decimal256Type>(
@@ -10242,7 +10291,7 @@ mod tests {
    #[test]
    fn test_decimal_to_decimal_throw_error_on_precision_overflow_same_scale() {
        let array = vec![Some(123456789)];
-       let array = create_decimal_array(array, 24, 2).unwrap();
+       let array = create_decimal128_array(array, 24, 2).unwrap();
        println!("{:?}", array);
        let input_type = DataType::Decimal128(24, 2);
        let output_type = DataType::Decimal128(6, 2);
@@ -10260,7 +10309,7 @@ mod tests {
    #[test]
    fn test_decimal_to_decimal_throw_error_on_precision_overflow_lower_scale() {
        let array = vec![Some(123456789)];
-       let array = create_decimal_array(array, 24, 2).unwrap();
+       let array = create_decimal128_array(array, 24, 2).unwrap();
        println!("{:?}", array);
        let input_type = DataType::Decimal128(24, 4);
        let output_type = DataType::Decimal128(6, 2);
@@ -10278,7 +10327,7 @@ mod tests {
    #[test]
    fn test_decimal_to_decimal_throw_error_on_precision_overflow_greater_scale() {
        let array = vec![Some(123456789)];
-       let array = create_decimal_array(array, 24, 2).unwrap();
+       let array = create_decimal128_array(array, 24, 2).unwrap();
        println!("{:?}", array);
        let input_type = DataType::Decimal128(24, 2);
        let output_type = DataType::Decimal128(6, 3);
@@ -10296,7 +10345,7 @@ mod tests {
    #[test]
    fn test_decimal_to_decimal_throw_error_on_precision_overflow_diff_type() {
        let array = vec![Some(123456789)];
-       let array = create_decimal_array(array, 24, 2).unwrap();
+       let array = create_decimal128_array(array, 24, 2).unwrap();
        println!("{:?}", array);
        let input_type = DataType::Decimal128(24, 2);
        let output_type = DataType::Decimal256(6, 2);

diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 7c5b9ea52ed4..6f016d213675 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -143,10 +143,6 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
         DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
             [empty_buffer, MutableBuffer::new(0)]
         }
-        DataType::Decimal32(_, _) | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [
-            MutableBuffer::new(capacity * mem::size_of::<i128>()),
-            empty_buffer,
-        ],
         DataType::Union(_, mode) => {
             let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
             match mode {

diff --git a/arrow-json/src/writer/mod.rs b/arrow-json/src/writer/mod.rs
index ee6d83a0a1f0..1cc43ec4e8d0 100644
--- a/arrow-json/src/writer/mod.rs
+++ b/arrow-json/src/writer/mod.rs
@@ -1878,6 +1878,54 @@ mod tests {
        )
    }
 
+    #[test]
+    fn test_decimal32_encoder() {
+        let array = Decimal32Array::from_iter_values([1234, 5678, 9012])
+            .with_precision_and_scale(8, 2)
+            .unwrap();
+        let field = Arc::new(Field::new("decimal", array.data_type().clone(), true));
+        let schema = Schema::new(vec![field]);
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap();
+
+        let mut buf = Vec::new();
+        {
+            let mut writer = LineDelimitedWriter::new(&mut buf);
+            writer.write_batches(&[&batch]).unwrap();
+        }
+
+        assert_json_eq(
+            &buf,
+            r#"{"decimal":12.34}
+{"decimal":56.78}
+{"decimal":90.12}
+"#,
+        );
+    }
+
+    #[test]
+    fn test_decimal64_encoder() {
+        let array = Decimal64Array::from_iter_values([1234, 5678, 9012])
+            .with_precision_and_scale(10, 2)
+            .unwrap();
+        let field = Arc::new(Field::new("decimal", array.data_type().clone(), true));
+        let schema = Schema::new(vec![field]);
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap();
+
+        let mut buf = Vec::new();
+        {
+            let mut writer = LineDelimitedWriter::new(&mut buf);
+            writer.write_batches(&[&batch]).unwrap();
+        }
+
+        assert_json_eq(
+            &buf,
+            r#"{"decimal":12.34}
+{"decimal":56.78}
+{"decimal":90.12}
+"#,
+        );
+    }
+
    #[test]
    fn test_decimal128_encoder() {
        let array = Decimal128Array::from_iter_values([1234, 5678, 9012])

diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs
index bb82f54d4918..83f765229f57 100644
--- a/arrow-ord/src/comparison.rs
+++ b/arrow-ord/src/comparison.rs
@@ -3059,6 +3059,117 @@ mod tests {
        );
    }
 
+    fn create_decimal_array<T: DecimalType>(data: Vec<Option<T::Native>>) -> PrimitiveArray<T> {
+        data.into_iter().collect::<PrimitiveArray<T>>()
+    }
+
+    fn test_cmp_dict_decimal<T: DecimalType>(values1: Vec<Option<T::Native>>, values2: Vec<Option<T::Native>>) {
+        let values = create_decimal_array::<T>(values1);
+        let keys = Int8Array::from_iter_values([1_i8, 2, 5, 4, 3, 0]);
+        let array1 = DictionaryArray::new(keys, Arc::new(values));
+
+        let values = create_decimal_array::<T>(values2);
+        let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]);
+        let array2 = DictionaryArray::new(keys, Arc::new(values));
+
+        let expected = BooleanArray::from(vec![false, false, false, true, true, false]);
+        assert_eq!(crate::cmp::eq(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![true, true, false, false, false, true]);
+        assert_eq!(crate::cmp::lt(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![true, true, false, true, true, true]);
+        assert_eq!(crate::cmp::lt_eq(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![false, false, true, false, false, false]);
+        assert_eq!(crate::cmp::gt(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![false, false, true, true, true, false]);
+        assert_eq!(crate::cmp::gt_eq(&array1, &array2).unwrap(), expected);
+    }
+
+    #[test]
+    fn test_cmp_dict_decimal32() {
+        test_cmp_dict_decimal::<Decimal32Type>(
+            vec![Some(0), Some(1), Some(2), Some(3), Some(4), Some(5)],
+            vec![Some(7), Some(-3), Some(4), Some(3), Some(5)],
+        );
+    }
+
+    #[test]
+    fn test_cmp_dict_non_dict_decimal32() {
+        let array1: Decimal32Array = Decimal32Array::from_iter_values([1, 2, 5, 4, 3, 0]);
+
+        let values = Decimal32Array::from_iter_values([7, -3, 4, 3, 5]);
+        let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]);
+        let array2 = DictionaryArray::new(keys, Arc::new(values));
+
+        let expected = BooleanArray::from(vec![false, false, false, true, true, false]);
+        assert_eq!(crate::cmp::eq(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![true, true, false, false, false, true]);
+        assert_eq!(crate::cmp::lt(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![true, true, false, true, true, true]);
+        assert_eq!(crate::cmp::lt_eq(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![false, false, true, false, false, false]);
+        assert_eq!(crate::cmp::gt(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![false, false, true, true, true, false]);
+        assert_eq!(crate::cmp::gt_eq(&array1, &array2).unwrap(), expected);
+    }
+
+    #[test]
+    fn test_cmp_dict_decimal64() {
+        let values = Decimal64Array::from_iter_values([0, 1, 2, 3, 4, 5]);
+        let keys = Int8Array::from_iter_values([1_i8, 2, 5, 4, 3, 0]);
+        let array1 = DictionaryArray::new(keys, Arc::new(values));
+
+        let values = Decimal64Array::from_iter_values([7, -3, 4, 3, 5]);
+        let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]);
+        let array2 = DictionaryArray::new(keys, Arc::new(values));
+
+        let expected = BooleanArray::from(vec![false, false, false, true, true, false]);
+        assert_eq!(crate::cmp::eq(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![true, true, false, false, false, true]);
+        assert_eq!(crate::cmp::lt(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![true, true, false, true, true, true]);
+        assert_eq!(crate::cmp::lt_eq(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![false, false, true, false, false, false]);
+        assert_eq!(crate::cmp::gt(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![false, false, true, true, true, false]);
+        assert_eq!(crate::cmp::gt_eq(&array1, &array2).unwrap(), expected);
+    }
+
+    #[test]
+    fn test_cmp_dict_non_dict_decimal64() {
+        let array1: Decimal64Array = Decimal64Array::from_iter_values([1, 2, 5, 4, 3, 0]);
+
+        let values = Decimal64Array::from_iter_values([7, -3, 4, 3, 5]);
+        let keys = Int8Array::from_iter_values([0_i8, 0, 1, 2, 3, 4]);
+        let array2 = DictionaryArray::new(keys, Arc::new(values));
+
+        let expected = BooleanArray::from(vec![false, false, false, true, true, false]);
+        assert_eq!(crate::cmp::eq(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![true, true, false, false, false, true]);
+        assert_eq!(crate::cmp::lt(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![true, true, false, true, true, true]);
+        assert_eq!(crate::cmp::lt_eq(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![false, false, true, false, false, false]);
+        assert_eq!(crate::cmp::gt(&array1, &array2).unwrap(), expected);
+
+        let expected = BooleanArray::from(vec![false, false, true, true, true, false]);
+        assert_eq!(crate::cmp::gt_eq(&array1, &array2).unwrap(), expected);
+    }
+
    #[test]
    fn test_cmp_dict_decimal128() {
        let values = Decimal128Array::from_iter_values([0, 1, 2, 3, 4, 5]);
@@ -3163,6 +3274,103 @@ mod tests {
        assert_eq!(crate::cmp::gt_eq(&array1, &array2).unwrap(), expected);
    }
 
+    #[test]
+    fn test_decimal32() {
+        let a = Decimal32Array::from_iter_values([1, 2, 4, 5]);
+        let b = Decimal32Array::from_iter_values([7, -3, 4, 3]);
+        let e = BooleanArray::from(vec![false, false, true, false]);
+        let r = crate::cmp::eq(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        let e = BooleanArray::from(vec![true, false, false, false]);
+        let r = crate::cmp::lt(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        let e = BooleanArray::from(vec![true, false, true, false]);
+        let r = crate::cmp::lt_eq(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        let e = BooleanArray::from(vec![false, true, false, true]);
+        let r = crate::cmp::gt(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        let e = BooleanArray::from(vec![false, true, true, true]);
+        let r = crate::cmp::gt_eq(&a, &b).unwrap();
+        assert_eq!(e, r);
+    }
+
+    #[test]
+    fn test_decimal32_scalar() {
+        let a = Decimal32Array::from(vec![Some(1), Some(2), Some(3), None, Some(4), Some(5)]);
+        let b = Decimal32Array::new_scalar(3_i32);
+        // array eq scalar
+        let e = BooleanArray::from(
+            vec![Some(false), Some(false), Some(true), None, Some(false), Some(false)],
+        );
+        let r = crate::cmp::eq(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        // array neq scalar
+        let e = BooleanArray::from(
+            vec![Some(true), Some(true), Some(false), None, Some(true), Some(true)],
+        );
+        let r = crate::cmp::neq(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        // array lt scalar
+        let e = BooleanArray::from(
+            vec![Some(true), Some(true), Some(false), None, Some(false), Some(false)],
+        );
+        let r = crate::cmp::lt(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        // array lt_eq scalar
+        let e = BooleanArray::from(
+            vec![Some(true), Some(true), Some(true), None, Some(false), Some(false)],
+        );
+        let r = crate::cmp::lt_eq(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        // array gt scalar
+        let e = BooleanArray::from(
+            vec![Some(false), Some(false), Some(false), None, Some(true), Some(true)],
+        );
+        let r = crate::cmp::gt(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        // array gt_eq scalar
+        let e = BooleanArray::from(
+            vec![Some(false), Some(false), Some(true), None, Some(true), Some(true)],
+        );
+        let r = crate::cmp::gt_eq(&a, &b).unwrap();
+        assert_eq!(e, r);
+    }
+
+    #[test]
+    fn test_decimal64() {
+        let a = Decimal64Array::from_iter_values([1, 2, 4, 5]);
+        let b = Decimal64Array::from_iter_values([7, -3, 4, 3]);
+        let e = BooleanArray::from(vec![false, false, true, false]);
+        let r = crate::cmp::eq(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        let e = BooleanArray::from(vec![true, false, false, false]);
+        let r = crate::cmp::lt(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        let e = BooleanArray::from(vec![true, false, true, false]);
+        let r = crate::cmp::lt_eq(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        let e = BooleanArray::from(vec![false, true, false, true]);
+        let r = crate::cmp::gt(&a, &b).unwrap();
+        assert_eq!(e, r);
+
+        let e = BooleanArray::from(vec![false, true, true, true]);
+        let r = crate::cmp::gt_eq(&a, &b).unwrap();
+        assert_eq!(e, r);
+    }
+
    #[test]
    fn test_decimal128() {
        let a = Decimal128Array::from_iter_values([1, 2, 4, 5]);

diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs
index 55e397cd8aa4..9dc22e1f5a4c 100644
--- a/arrow-ord/src/ord.rs
+++ b/arrow-ord/src/ord.rs
@@ -549,7 +549,33 @@ mod tests {
    }
 
    #[test]
-    fn test_decimal() {
+    fn test_decimali32() {
+        let array = vec![Some(5_i32), Some(2_i32), Some(3_i32)]
+            .into_iter()
+            .collect::<Decimal32Array>()
+            .with_precision_and_scale(8, 6)
+            .unwrap();
+
+        let cmp = make_comparator(&array, &array, SortOptions::default()).unwrap();
+        assert_eq!(Ordering::Less, cmp(1, 0));
+        assert_eq!(Ordering::Greater, cmp(0, 2));
+    }
+
+    #[test]
+    fn test_decimali64() {
+        let array = vec![Some(5_i64), Some(2_i64), Some(3_i64)]
+            .into_iter()
+            .collect::<Decimal64Array>()
+            .with_precision_and_scale(16, 6)
+            .unwrap();
+
+        let cmp = make_comparator(&array, &array, SortOptions::default()).unwrap();
+        assert_eq!(Ordering::Less, cmp(1, 0));
+        assert_eq!(Ordering::Greater, cmp(0, 2));
+    }
+
+    #[test]
+    fn test_decimali128() {
        let array = vec![Some(5_i128), Some(2_i128), Some(3_i128)]
            .into_iter()
            .collect::<Decimal128Array>()

diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs
index 51a6659e631b..e18979524eb0 100644
--- a/arrow-ord/src/sort.rs
+++ b/arrow-ord/src/sort.rs
@@ -793,10 +793,18 @@ mod tests {
    use rand::rngs::StdRng;
    use rand::{Rng, RngCore, SeedableRng};
 
-    fn create_decimal128_array(data: Vec<Option<i128>>) -> Decimal128Array {
+    fn create_decimal_array<T: DecimalType>(
+        data: Vec<Option<usize>>,
+        precision: u8,
+        scale: i8
+    ) -> PrimitiveArray<T> {
        data.into_iter()
-            .collect::<Decimal128Array>()
-            .with_precision_and_scale(23, 6)
+            .map(|x| match x {
+                None => None,
+                Some(y) => T::Native::from_usize(y),
+            })
+            .collect::<PrimitiveArray<T>>()
+            .with_precision_and_scale(precision, scale)
            .unwrap()
    }
 
@@ -807,13 +815,15 @@ mod tests {
            .unwrap()
    }
 
-    fn test_sort_to_indices_decimal128_array(
-        data: Vec<Option<i128>>,
+    fn test_sort_to_indices_decimal_array<T: DecimalType>(
+        data: Vec<Option<usize>>,
        options: Option<SortOptions>,
        limit: Option<usize>,
        expected_data: Vec<u32>,
+        precision: u8,
+        scale: i8,
    ) {
-        let output = create_decimal128_array(data);
+        let output = create_decimal_array::<T>(data, precision, scale);
        let expected = UInt32Array::from(expected_data);
        let output = sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap();
        assert_eq!(output, expected)
@@ -831,14 +841,16 @@ mod tests {
        assert_eq!(output, expected)
    }
 
-    fn test_sort_decimal128_array(
-        data: Vec<Option<i128>>,
+    fn test_sort_decimal_array<T: DecimalType>(
+        data: Vec<Option<usize>>,
        options: Option<SortOptions>,
        limit: Option<usize>,
-        expected_data: Vec<Option<i128>>,
+        expected_data: Vec<Option<usize>>,
+        p: u8,
+        s: i8,
    ) {
-        let output = create_decimal128_array(data);
-        let expected = Arc::new(create_decimal128_array(expected_data)) as ArrayRef;
+        let output = create_decimal_array::<T>(data, p, s);
+        let expected = Arc::new(create_decimal_array::<T>(expected_data, p, s)) as ArrayRef;
        let output = match limit {
            Some(_) => sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap(),
            _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(),
@@ -1541,17 +1553,18 @@ mod tests {
        );
    }
 
-    #[test]
-    fn test_sort_indices_decimal128() {
+    fn test_sort_indices_decimal<T: DecimalType>(precision: u8, scale: i8) {
        // decimal default
-        test_sort_to_indices_decimal128_array(
+        test_sort_to_indices_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            None,
            None,
            vec![0, 6, 4, 2, 3, 5, 1],
+            precision,
+            scale,
        );
        // decimal descending
-        test_sort_to_indices_decimal128_array(
+        test_sort_to_indices_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            Some(SortOptions {
                descending: true,
                nulls_first: false,
            }),
            None,
            vec![1, 5, 3, 2, 4, 0, 6],
+            precision,
+            scale,
        );
        // decimal null_first and descending
-        test_sort_to_indices_decimal128_array(
+        test_sort_to_indices_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            Some(SortOptions {
                descending: true,
                nulls_first: true,
            }),
            None,
            vec![0, 6, 1, 5, 3, 2, 4],
+            precision,
+            scale,
        );
        // decimal null_first
-        test_sort_to_indices_decimal128_array(
+        test_sort_to_indices_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            Some(SortOptions {
                descending: false,
                nulls_first: true,
            }),
            None,
            vec![0, 6, 4, 2, 3, 5, 1],
+            precision,
+            scale,
        );
        // limit
-        test_sort_to_indices_decimal128_array(
+        test_sort_to_indices_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            None,
            Some(3),
            vec![0, 6, 4],
+            precision,
+            scale,
        );
        // limit descending
-        test_sort_to_indices_decimal128_array(
+        test_sort_to_indices_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            Some(SortOptions {
                descending: true,
                nulls_first: false,
            }),
            Some(3),
            vec![1, 5, 3],
+            precision,
+            scale,
        );
        // limit descending null_first
-        test_sort_to_indices_decimal128_array(
+        test_sort_to_indices_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            Some(SortOptions {
                descending: true,
                nulls_first: true,
            }),
            Some(3),
            vec![0, 6, 1],
+            precision,
+            scale,
        );
        // limit null_first
-        test_sort_to_indices_decimal128_array(
+        test_sort_to_indices_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            Some(SortOptions {
                descending: false,
                nulls_first: true,
            }),
            Some(3),
            vec![0, 6, 4],
+            precision,
+            scale,
        );
    }
 
    #[test]
-    fn test_sort_indices_decimal256() {
-        let data = vec![
-            None,
-            Some(i256::from_i128(5)),
-            Some(i256::from_i128(2)),
-            Some(i256::from_i128(3)),
-            Some(i256::from_i128(1)),
-            Some(i256::from_i128(4)),
-            None,
-        ];
+    fn test_sort_indices_decimal32() {
+        test_sort_indices_decimal::<Decimal32Type>(8, 3);
+    }
 
-        // decimal default
-        test_sort_to_indices_decimal256_array(data.clone(), None, None, vec![0, 6, 4, 2, 3, 5, 1]);
-        // decimal descending
-        test_sort_to_indices_decimal256_array(
-            data.clone(),
-            Some(SortOptions {
-                descending: true,
-                nulls_first: false,
-            }),
-            None,
-            vec![1, 5, 3, 2, 4, 0, 6],
-        );
-        // decimal null_first and descending
-        test_sort_to_indices_decimal256_array(
-            data.clone(),
-            Some(SortOptions {
-                descending: true,
-                nulls_first: true,
-            }),
-            None,
-            vec![0, 6, 1, 5, 3, 2, 4],
-        );
-        // decimal null_first
-        test_sort_to_indices_decimal256_array(
-            data.clone(),
-            Some(SortOptions {
-                descending: false,
-                nulls_first: true,
-            }),
-            None,
-            vec![0, 6, 4, 2, 3, 5, 1],
-        );
-        // limit
-        test_sort_to_indices_decimal256_array(data.clone(), None, Some(3), vec![0, 6, 4]);
-        // limit descending
-        test_sort_to_indices_decimal256_array(
-            data.clone(),
-            Some(SortOptions {
-                descending: true,
-                nulls_first: false,
-            }),
-            Some(3),
-            vec![1, 5, 3],
-        );
-        // limit descending null_first
-        test_sort_to_indices_decimal256_array(
-            data.clone(),
-            Some(SortOptions {
-                descending: true,
-                nulls_first: true,
-            }),
-            Some(3),
-            vec![0, 6, 1],
-        );
-        // limit null_first
-        test_sort_to_indices_decimal256_array(
-            data,
-            Some(SortOptions {
-                descending: false,
-                nulls_first: true,
-            }),
-            Some(3),
-            vec![0, 6, 4],
-        );
+    #[test]
+    fn test_sort_indices_decimal64() {
+        test_sort_indices_decimal::<Decimal64Type>(17, 5);
+    }
+
+    #[test]
+    fn test_sort_indices_decimal128() {
+        test_sort_indices_decimal::<Decimal128Type>(23, 6);
+    }
+
+    #[test]
+    fn test_sort_indices_decimal256() {
+        test_sort_indices_decimal::<Decimal256Type>(53, 6);
    }
 
    #[test]
@@ -1747,17 +1716,18 @@ mod tests {
        );
    }
 
-    #[test]
-    fn test_sort_decimal128() {
+    fn test_sort_decimal<T: DecimalType>(precision: u8, scale: i8) {
        // decimal default
-        test_sort_decimal128_array(
+        test_sort_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            None,
            None,
            vec![None, None, Some(1), Some(2), Some(3), Some(4), Some(5)],
+            precision,
+            scale,
        );
        // decimal descending
-        test_sort_decimal128_array(
+        test_sort_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            Some(SortOptions {
                descending: true,
                nulls_first: false,
            }),
            None,
            vec![Some(5), Some(4), Some(3), Some(2), Some(1), None, None],
+            precision,
+            scale,
        );
        // decimal null_first and descending
-        test_sort_decimal128_array(
+        test_sort_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            Some(SortOptions {
                descending: true,
                nulls_first: true,
            }),
            None,
            vec![None, None, Some(5), Some(4), Some(3), Some(2), Some(1)],
+            precision,
+            scale,
        );
        // decimal null_first
-        test_sort_decimal128_array(
+        test_sort_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            Some(SortOptions {
                descending: false,
                nulls_first: true,
            }),
            None,
            vec![None, None, Some(1), Some(2), Some(3), Some(4), Some(5)],
+            precision,
+            scale,
        );
        // limit
-        test_sort_decimal128_array(
+        test_sort_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            None,
            Some(3),
            vec![None, None, Some(1)],
+            precision,
+            scale,
        );
        // limit descending
-        test_sort_decimal128_array(
+        test_sort_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            Some(SortOptions {
                descending: true,
                nulls_first: false,
            }),
            Some(3),
            vec![Some(5), Some(4), Some(3)],
+            precision,
+            scale,
        );
        // limit descending null_first
-        test_sort_decimal128_array(
+        test_sort_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            Some(SortOptions {
                descending: true,
                nulls_first: true,
            }),
            Some(3),
            vec![None, None, Some(5)],
+            precision,
+            scale,
        );
        // limit null_first
-        test_sort_decimal128_array(
+        test_sort_decimal_array::<T>(
            vec![None, Some(5), Some(2), Some(3), Some(1), Some(4), None],
            Some(SortOptions {
                descending: false,
                nulls_first: true,
            }),
            Some(3),
            vec![None, None, Some(1)],
+            precision,
+            scale,
        );
    }
 
+    #[test]
+    fn test_sort_decimal32() {
+        test_sort_decimal::<Decimal32Type>(8, 3);
+    }
+
+    #[test]
+    fn test_sort_decimal64() {
+        test_sort_decimal::<Decimal64Type>(17, 5);
+    }
+
+    #[test]
+    fn test_sort_decimal128() {
+        test_sort_decimal::<Decimal128Type>(23, 6);
+    }
+
    #[test]
    fn test_sort_decimal256() {
-        let data = vec![
-            None,
-            Some(i256::from_i128(5)),
-            Some(i256::from_i128(2)),
-            Some(i256::from_i128(3)),
-            Some(i256::from_i128(1)),
-            Some(i256::from_i128(4)),
-            None,
-        ];
-        // decimal default
-        test_sort_decimal256_array(
-            data.clone(),
-            None,
-            None,
-            [None, None, Some(1), Some(2), Some(3), Some(4), Some(5)]
-                .iter()
-                .map(|v| v.map(i256::from_i128))
-                .collect(),
-        );
-        // decimal descending
-        test_sort_decimal256_array(
-            data.clone(),
-            Some(SortOptions {
-                descending: true,
-                nulls_first: false,
-            }),
-            None,
-            [Some(5), Some(4), Some(3), Some(2), Some(1), None, None]
-                .iter()
-                .map(|v| v.map(i256::from_i128))
-                .collect(),
-        );
-        // decimal null_first and descending
-        test_sort_decimal256_array(
-            data.clone(),
-            Some(SortOptions {
-                descending: true,
-                nulls_first: true,
-            }),
-            None,
-            [None, None, Some(5), Some(4), Some(3), Some(2), Some(1)]
-                .iter()
-                .map(|v| v.map(i256::from_i128))
-                .collect(),
-        );
-        // decimal null_first
-        test_sort_decimal256_array(
-            data.clone(),
-            Some(SortOptions {
-                descending: false,
-                nulls_first: true,
-            }),
-            None,
-            [None, None, Some(1), Some(2), Some(3), Some(4), Some(5)]
-                .iter()
-                .map(|v| v.map(i256::from_i128))
-                .collect(),
-        );
-        // limit
-        test_sort_decimal256_array(
-            data.clone(),
-            None,
-            Some(3),
-            [None, None, Some(1)]
-                .iter()
-                .map(|v| v.map(i256::from_i128))
-                .collect(),
-        );
-        // limit descending
-        test_sort_decimal256_array(
-            data.clone(),
-            Some(SortOptions {
-                descending: true,
-                nulls_first: false,
-            }),
-            Some(3),
-            [Some(5), Some(4), Some(3)]
-                .iter()
-                .map(|v| v.map(i256::from_i128))
-                .collect(),
-        );
-        // limit descending null_first
-        test_sort_decimal256_array(
-            data.clone(),
-            Some(SortOptions {
-                descending: true,
-                nulls_first: true,
-            }),
-            Some(3),
-            [None, None, Some(5)]
-                .iter()
-                .map(|v| v.map(i256::from_i128))
-                .collect(),
-        );
-        // limit null_first
-        test_sort_decimal256_array(
-            data,
-            Some(SortOptions {
-                descending: false,
-                nulls_first: true,
-            }),
-            Some(3),
-            [None, None, Some(1)]
-                .iter()
-                .map(|v| v.map(i256::from_i128))
-                .collect(),
-        );
+        test_sort_decimal::<Decimal256Type>(53, 6);
    }
 
    #[test]

diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index d0fad12210db..5421029304cd 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -1510,6 +1510,66 @@ mod tests {
        }
    }
 
+    #[test]
+    fn test_decimal32() {
+        let converter = RowConverter::new(vec![SortField::new(DataType::Decimal32(
+            DECIMAL32_MAX_PRECISION,
+            7,
+        ))])
+        .unwrap();
+        let col = Arc::new(
+            Decimal32Array::from_iter([
+                None,
+                Some(i32::MIN),
+                Some(-13),
+                Some(46_i32),
+                Some(5456_i32),
+                Some(i32::MAX),
+            ])
+            .with_precision_and_scale(9, 7)
+            .unwrap(),
+        ) as ArrayRef;
+
+        let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();
+        for i in 0..rows.num_rows() - 1 {
+            assert!(rows.row(i) < rows.row(i + 1));
+        }
+
+        let back = converter.convert_rows(&rows).unwrap();
+        assert_eq!(back.len(), 1);
+        assert_eq!(col.as_ref(), back[0].as_ref())
+    }
+
+    #[test]
+    fn test_decimal64() {
+        let converter = RowConverter::new(vec![SortField::new(DataType::Decimal64(
+            DECIMAL64_MAX_PRECISION,
+            7,
+        ))])
+        .unwrap();
+        let col = Arc::new(
+            Decimal64Array::from_iter([
+                None,
+                Some(i64::MIN),
+                Some(-13),
+                Some(46_i64),
+                Some(5456_i64),
+                Some(i64::MAX),
+            ])
+            .with_precision_and_scale(18, 7)
+            .unwrap(),
+        ) as ArrayRef;
+
+        let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();
+        for i in 0..rows.num_rows() - 1 {
+            assert!(rows.row(i) < rows.row(i + 1));
+        }
+
+        let back = converter.convert_rows(&rows).unwrap();
+        assert_eq!(back.len(), 1);
+        assert_eq!(col.as_ref(), back[0].as_ref())
+    }
+
    #[test]
    fn test_decimal128() {
        let converter = RowConverter::new(vec![SortField::new(DataType::Decimal128(

diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs
index bf9962b69f7b..addaafb540ca 100644
--- a/arrow/tests/array_cast.rs
+++ b/arrow/tests/array_cast.rs
@@ -24,7 +24,8 @@ use arrow_array::types::{
 };
 use arrow_array::{
     Array, ArrayRef, ArrowPrimitiveType, BinaryArray, BooleanArray, Date32Array, Date64Array,
-    Decimal128Array, DurationMicrosecondArray, DurationMillisecondArray, DurationNanosecondArray,
+    Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array,
+    DurationMicrosecondArray, DurationMillisecondArray, DurationNanosecondArray,
     DurationSecondArray, FixedSizeBinaryArray, FixedSizeListArray, Float16Array, Float32Array,
     Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray,
     IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeBinaryArray, LargeListArray,
@@ -262,7 +263,14 @@ fn get_arrays_of_all_types() -> Vec<ArrayRef> {
        Arc::new(DurationMillisecondArray::from(vec![1000, 2000])),
        Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])),
        Arc::new(DurationNanosecondArray::from(vec![1000, 2000])),
-        Arc::new(create_decimal_array(vec![Some(1), Some(2), Some(3)], 38, 0).unwrap()),
+        Arc::new(create_decimal32_array(vec![Some(1), Some(2), Some(3)], 9, 0).unwrap()),
+        Arc::new(create_decimal64_array(vec![Some(1), Some(2), Some(3)], 18, 0).unwrap()),
+        Arc::new(create_decimal128_array(vec![Some(1), Some(2), Some(3)], 38, 0).unwrap()),
+        Arc::new(create_decimal256_array(vec![
+            Some(i256::from_i128(1)),
+            Some(i256::from_i128(2)),
+            Some(i256::from_i128(3))
+        ], 40, 0).unwrap()),
        make_dictionary_primitive::<Int8Type>(vec![1, 2]),
        make_dictionary_primitive::<Int16Type>(vec![1, 2]),
        make_dictionary_primitive::<Int32Type>(vec![1, 2]),
@@ -428,7 +436,29 @@ fn make_dictionary_utf8() -> ArrayRef {
    Arc::new(b.finish())
 }
 
-fn create_decimal_array(
+fn create_decimal32_array(
+    array: Vec<Option<i32>>,
+    precision: u8,
+    scale: i8,
+) -> Result<Decimal32Array, ArrowError> {
+    array
+        .into_iter()
+        .collect::<Decimal32Array>()
+        .with_precision_and_scale(precision, scale)
+}
+
+fn create_decimal64_array(
+    array: Vec<Option<i64>>,
+    precision: u8,
+    scale: i8,
+) -> Result<Decimal64Array, ArrowError> {
+    array
+        .into_iter()
+        .collect::<Decimal64Array>()
+        .with_precision_and_scale(precision, scale)
+}
+
+fn create_decimal128_array(
    array: Vec<Option<i128>>,
    precision: u8,
    scale: i8,
@@ -439,6 +469,17 @@ fn create_decimal128_array(
        .with_precision_and_scale(precision, scale)
 }
 
+fn create_decimal256_array(
+    array: Vec<Option<i256>>,
+    precision: u8,
+    scale: i8,
+) -> Result<Decimal256Array, ArrowError> {
+    array
+        .into_iter()
+        .collect::<Decimal256Array>()
+        .with_precision_and_scale(precision, scale)
+}
+
 // Get a selection of datatypes to try and cast to
 fn get_all_types() -> Vec<DataType> {
    use DataType::*;
@@ -519,6 +560,7 @@ fn get_all_types() -> Vec<DataType> {
        Dictionary(Box::new(key_type.clone()), Box::new(Binary)),
        Dictionary(Box::new(key_type.clone()), Box::new(LargeBinary)),
        Dictionary(Box::new(key_type.clone()), Box::new(Decimal32(9, 0))),
+        Dictionary(Box::new(key_type.clone()), Box::new(Decimal64(18, 0))),
        Dictionary(Box::new(key_type.clone()), Box::new(Decimal128(38, 0))),
        Dictionary(Box::new(key_type), Box::new(Decimal256(76, 0))),
    ]

diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs
index 375db933b511..c7b3e69450a6 100644
--- a/parquet/src/arrow/array_reader/primitive_array.rs
+++ b/parquet/src/arrow/array_reader/primitive_array.rs
@@ -25,7 +25,7 @@ use crate::errors::{ParquetError, Result};
 use crate::schema::types::ColumnDescPtr;
 use arrow_array::{
     builder::TimestampNanosecondBufferBuilder, ArrayRef, BooleanArray,
-    Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
+    Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array,
     Float32Array, Float64Array, Int32Array, Int64Array, TimestampNanosecondArray,
     UInt32Array, UInt64Array,
 };
@@ -220,10 +220,30 @@ where
                let a = arrow_cast::cast(&array, &ArrowType::Date32)?;
                arrow_cast::cast(&a, target_type)?
            }
-            ArrowType::Decimal128(p, s) => {
+            ArrowType::Decimal64(p, s) if *(array.data_type()) == ArrowType::Int32 => {
                // Apply conversion to all elements regardless of null slots as the conversion
-                // to `i128` is infallible. This improves performance by avoiding a branch in
+                // to `i64` is infallible. This improves performance by avoiding a branch in
                // the inner loop (see docs for `PrimitiveArray::unary`).
+                let array = match array.data_type() {
+                    ArrowType::Int32 => array
+                        .as_any()
+                        .downcast_ref::<Int32Array>()
+                        .unwrap()
+                        .unary(|i| i as i64)
+                        as Decimal64Array,
+                    _ => {
+                        return Err(arrow_err!(
+                            "Cannot convert {:?} to decimal",
+                            array.data_type()
+                        ));
+                    }
+                }
+                .with_precision_and_scale(*p, *s)?;
+
+                Arc::new(array) as ArrayRef
+            }
+            ArrowType::Decimal128(p, s) => {
+                // See above comment. Conversion to `i128` is likewise infallible.
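+                // Note: a parquet DECIMAL logical type may be backed by either an
+                // INT32 or an INT64 physical column, so both cases are widened to
+                // `i128` below.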
                    let array = match array.data_type() {
                        ArrowType::Int32 => array
                            .as_any()
                            .downcast_ref::
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 6eba04c86f91..6feedfcf8e0f 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -932,8 +932,9 @@ mod tests {
     use arrow_array::builder::*;
     use arrow_array::cast::AsArray;
     use arrow_array::types::{
-        Date32Type, Date64Type, Decimal128Type, Decimal256Type, DecimalType, Float16Type,
-        Float32Type, Float64Type, Time32MillisecondType, Time64MicrosecondType,
+        Date32Type, Date64Type, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type,
+        DecimalType, Float16Type, Float32Type, Float64Type, Time32MillisecondType,
+        Time64MicrosecondType,
     };
     use arrow_array::*;
     use arrow_buffer::{i256, ArrowNativeType, Buffer, IntervalDayTime};
@@ -4024,6 +4025,78 @@ mod tests {
         assert_eq!(out, batch.slice(2, 1));
     }
 
+    fn test_decimal32_roundtrip() {
+        let d = |values: Vec<i32>, p: u8| {
+            let iter = values.into_iter();
+            PrimitiveArray::<Decimal32Type>::from_iter_values(iter)
+                .with_precision_and_scale(p, 2)
+                .unwrap()
+        };
+
+        let d1 = d(vec![1, 2, 3, 4, 5], 9);
+        let batch = RecordBatch::try_from_iter([
+            ("d1", Arc::new(d1) as ArrayRef),
+        ])
+        .unwrap();
+
+        let mut buffer = Vec::with_capacity(1024);
+        let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+
+        let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buffer)).unwrap();
+        let t1 = builder.parquet_schema().columns()[0].physical_type();
+        assert_eq!(t1, PhysicalType::INT32);
+
+        let mut reader = builder.build().unwrap();
+        assert_eq!(batch.schema(), reader.schema());
+
+        let out = reader.next().unwrap().unwrap();
+        assert_eq!(batch, out);
+    }
+
+    fn test_decimal64_roundtrip() {
+        // Precision <= 9 -> INT32
+        // Precision <= 18 -> INT64
+
+        let d = |values: Vec<i64>, p: u8| {
+            let iter = values.into_iter();
+            PrimitiveArray::<Decimal64Type>::from_iter_values(iter)
+                .with_precision_and_scale(p, 2)
+                .unwrap()
+        };
+
+        let d1 = d(vec![1, 2, 3, 4, 5], 9);
+        let d2 = d(vec![1, 2, 3, 4, 10_i64.pow(10) - 1], 10);
+        let d3 = d(vec![1, 2, 3, 4, 10_i64.pow(18) - 1], 18);
+
+        let batch = RecordBatch::try_from_iter([
+            ("d1", Arc::new(d1) as ArrayRef),
+            ("d2", Arc::new(d2) as ArrayRef),
+            ("d3", Arc::new(d3) as ArrayRef),
+        ])
+        .unwrap();
+
+        let mut buffer = Vec::with_capacity(1024);
+        let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+
+        let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buffer)).unwrap();
+        let t1 = builder.parquet_schema().columns()[0].physical_type();
+        assert_eq!(t1, PhysicalType::INT32);
+        let t2 = builder.parquet_schema().columns()[1].physical_type();
+        assert_eq!(t2, PhysicalType::INT64);
+        let t3 = builder.parquet_schema().columns()[2].physical_type();
+        assert_eq!(t3, PhysicalType::INT64);
+
+        let mut reader = builder.build().unwrap();
+        assert_eq!(batch.schema(), reader.schema());
+
+        let out = reader.next().unwrap().unwrap();
+        assert_eq!(batch, out);
+    }
+
     fn test_decimal_roundtrip<T: DecimalType>() {
         // Precision <= 9 -> INT32
         // Precision <= 18 -> INT64
@@ -4073,6 +4146,8 @@ mod tests {
 
     #[test]
     fn test_decimal() {
+        test_decimal32_roundtrip();
+        test_decimal64_roundtrip();
         test_decimal_roundtrip::<Decimal128Type>();
         test_decimal_roundtrip::<Decimal256Type>();
     }
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index c9051062204d..689f7a103276 100644
---
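The INT32/INT64 assertions in the roundtrip tests above follow Parquet's rule for choosing a decimal column's physical type from its precision. A minimal free-standing sketch of that selection (illustrative only; the function name is hypothetical, not the writer's actual API):

    // Smallest Parquet physical type that can hold a decimal of the given
    // precision; mirrors the assertions in test_decimal32/64_roundtrip.
    fn decimal_physical_type(precision: u8) -> &'static str {
        match precision {
            1..=9 => "INT32",            // up to 9 decimal digits fit in an i32
            10..=18 => "INT64",          // up to 18 decimal digits fit in an i64
            _ => "FIXED_LEN_BYTE_ARRAY", // wider decimals are stored as raw bytes
        }
    }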
a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -2014,6 +2014,8 @@ mod tests { false, // fails to roundtrip keys_sorted false, ), + Field::new("c42", DataType::Decimal32(5, 2), false), + Field::new("c43", DataType::Decimal64(18, 12), true), ], meta(&[("Key", "Value")]), ); diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index 9f215b4dc07e..522582b408e2 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -69,7 +69,9 @@ fn apply_hint(parquet: DataType, hint: DataType) -> DataType { // Determine interval time unit (#1666) (DataType::Interval(_), DataType::Interval(_)) => hint, - // Promote to Decimal256 + // Promote to Decimal256 or narrow to Decimal32 or Decimal64 + (DataType::Decimal128(_, _), DataType::Decimal32(_, _)) => hint, + (DataType::Decimal128(_, _), DataType::Decimal64(_, _)) => hint, (DataType::Decimal128(_, _), DataType::Decimal256(_, _)) => hint, // Potentially preserve dictionary encoding diff --git a/parquet/tests/arrow_reader/mod.rs b/parquet/tests/arrow_reader/mod.rs index 0e6783583cd5..e6bec8279658 100644 --- a/parquet/tests/arrow_reader/mod.rs +++ b/parquet/tests/arrow_reader/mod.rs @@ -18,12 +18,13 @@ use arrow_array::types::{Int32Type, Int8Type}; use arrow_array::{ Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, Date64Array, - Decimal128Array, Decimal256Array, DictionaryArray, FixedSizeBinaryArray, Float16Array, - Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, - LargeStringArray, RecordBatch, StringArray, StringViewArray, StructArray, - Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, DictionaryArray, + FixedSizeBinaryArray, Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, + Int64Array, Int8Array, LargeBinaryArray, LargeStringArray, RecordBatch, StringArray, + StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, + Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, }; use arrow_buffer::i256; use arrow_schema::{DataType, Field, Schema, TimeUnit}; @@ -84,7 +85,9 @@ enum Scenario { Float16, Float32, Float64, - Decimal, + Decimal32, + Decimal64, + Decimal128, Decimal256, ByteArray, Dictionary, @@ -369,13 +372,49 @@ fn make_f16_batch(v: Vec) -> RecordBatch { RecordBatch::try_new(schema, vec![array.clone()]).unwrap() } -/// Return record batch with decimal vector +/// Return record batch with decimal32 vector /// /// Columns are named -/// "decimal_col" -> DecimalArray -fn make_decimal_batch(v: Vec, precision: u8, scale: i8) -> RecordBatch { +/// "decimal32_col" -> Decimal32Array +fn make_decimal32_batch(v: Vec, precision: u8, scale: i8) -> RecordBatch { let schema = Arc::new(Schema::new(vec![Field::new( - "decimal_col", + "decimal32_col", + DataType::Decimal32(precision, scale), + true, + )])); + let array = Arc::new( + Decimal32Array::from(v) + .with_precision_and_scale(precision, scale) + .unwrap(), + ) as ArrayRef; + RecordBatch::try_new(schema, vec![array.clone()]).unwrap() +} + +/// Return record batch with decimal64 vector +/// +/// 
Columns are named +/// "decimal64_col" -> Decimal64Array +fn make_decimal64_batch(v: Vec, precision: u8, scale: i8) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new( + "decimal64_col", + DataType::Decimal64(precision, scale), + true, + )])); + let array = Arc::new( + Decimal64Array::from(v) + .with_precision_and_scale(precision, scale) + .unwrap(), + ) as ArrayRef; + RecordBatch::try_new(schema, vec![array.clone()]).unwrap() +} + +/// Return record batch with decimal128 vector +/// +/// Columns are named +/// "decimal128_col" -> Decimal128Array +fn make_decimal128_batch(v: Vec, precision: u8, scale: i8) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new( + "decimal128_col", DataType::Decimal128(precision, scale), true, )])); @@ -730,12 +769,28 @@ fn create_data_batch(scenario: Scenario) -> Vec { make_f64_batch(vec![5.0, 6.0, 7.0, 8.0, 9.0]), ] } - Scenario::Decimal => { + Scenario::Decimal32 => { + // decimal record batch + vec![ + make_decimal32_batch(vec![100, 200, 300, 400, 600], 9, 2), + make_decimal32_batch(vec![-500, 100, 300, 400, 600], 9, 2), + make_decimal32_batch(vec![2000, 3000, 3000, 4000, 6000], 9, 2), + ] + } + Scenario::Decimal64 => { + // decimal record batch + vec![ + make_decimal64_batch(vec![100, 200, 300, 400, 600], 9, 2), + make_decimal64_batch(vec![-500, 100, 300, 400, 600], 9, 2), + make_decimal64_batch(vec![2000, 3000, 3000, 4000, 6000], 9, 2), + ] + } + Scenario::Decimal128 => { // decimal record batch vec![ - make_decimal_batch(vec![100, 200, 300, 400, 600], 9, 2), - make_decimal_batch(vec![-500, 100, 300, 400, 600], 9, 2), - make_decimal_batch(vec![2000, 3000, 3000, 4000, 6000], 9, 2), + make_decimal128_batch(vec![100, 200, 300, 400, 600], 9, 2), + make_decimal128_batch(vec![-500, 100, 300, 400, 600], 9, 2), + make_decimal128_batch(vec![2000, 3000, 3000, 4000, 6000], 9, 2), ] } Scenario::Decimal256 => { diff --git a/parquet/tests/arrow_reader/statistics.rs b/parquet/tests/arrow_reader/statistics.rs index 0eb0fc2b277f..64e92d972c54 100644 --- a/parquet/tests/arrow_reader/statistics.rs +++ b/parquet/tests/arrow_reader/statistics.rs @@ -31,12 +31,13 @@ use arrow::datatypes::{ }; use arrow_array::{ make_array, new_null_array, Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, - Date32Array, Date64Array, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, - Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, - LargeStringArray, RecordBatch, StringArray, StringViewArray, Time32MillisecondArray, - Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, - UInt32Array, UInt64Array, UInt8Array, + Date32Array, Date64Array, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, + FixedSizeBinaryArray, Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, + Int64Array, Int8Array, LargeBinaryArray, LargeStringArray, RecordBatch, StringArray, + StringViewArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, + Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, }; use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit}; use half::f16; @@ -526,6 +527,9 @@ async fn test_data_page_stats_with_all_null_page() { DataType::Utf8, DataType::LargeUtf8, 
             DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+            DataType::Decimal32(8, 2),   // as INT32
+            DataType::Decimal64(8, 2),   // as INT32
+            DataType::Decimal64(10, 2),  // as INT64
             DataType::Decimal128(8, 2),  // as INT32
             DataType::Decimal128(10, 2), // as INT64
             DataType::Decimal128(20, 2), // as FIXED_LEN_BYTE_ARRAY
@@ -1713,11 +1717,71 @@ async fn test_float16() {
 }
 
 #[tokio::test]
-async fn test_decimal() {
-    // This creates a parquet file of 1 column "decimal_col" with decimal data type and precision 9, scale 2
+async fn test_decimal32() {
+    // This creates a parquet file of 1 column "decimal32_col" with decimal data type and precision 9, scale 2
     // file has 3 record batches, each has 5 rows. They will be saved into 3 row groups
     let reader = TestReader {
-        scenario: Scenario::Decimal,
+        scenario: Scenario::Decimal32,
         row_per_group: 5,
     }
     .build()
     .await;
+
+    Test {
+        reader: &reader,
+        expected_min: Arc::new(
+            Decimal32Array::from(vec![100, -500, 2000])
+                .with_precision_and_scale(9, 2)
+                .unwrap(),
+        ),
+        expected_max: Arc::new(
+            Decimal32Array::from(vec![600, 600, 6000])
+                .with_precision_and_scale(9, 2)
+                .unwrap(),
+        ),
+        expected_null_counts: UInt64Array::from(vec![0, 0, 0]),
+        expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])),
+        column_name: "decimal32_col",
+        check: Check::Both,
+    }
+    .run();
+}
+#[tokio::test]
+async fn test_decimal64() {
+    // This creates a parquet file of 1 column "decimal64_col" with decimal data type and precision 9, scale 2
+    // file has 3 record batches, each has 5 rows. They will be saved into 3 row groups
+    let reader = TestReader {
+        scenario: Scenario::Decimal64,
+        row_per_group: 5,
+    }
+    .build()
+    .await;
+
+    Test {
+        reader: &reader,
+        expected_min: Arc::new(
+            Decimal64Array::from(vec![100, -500, 2000])
+                .with_precision_and_scale(9, 2)
+                .unwrap(),
+        ),
+        expected_max: Arc::new(
+            Decimal64Array::from(vec![600, 600, 6000])
+                .with_precision_and_scale(9, 2)
+                .unwrap(),
+        ),
+        expected_null_counts: UInt64Array::from(vec![0, 0, 0]),
+        expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])),
+        column_name: "decimal64_col",
+        check: Check::Both,
+    }
+    .run();
+}
+#[tokio::test]
+async fn test_decimal128() {
+    // This creates a parquet file of 1 column "decimal128_col" with decimal data type and precision 9, scale 2
+    // file has 3 record batches, each has 5 rows.
They will be saved into 3 row groups + let reader = TestReader { + scenario: Scenario::Decimal128, row_per_group: 5, } .build() @@ -1737,7 +1801,7 @@ async fn test_decimal() { ), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), - column_name: "decimal_col", + column_name: "decimal128_col", check: Check::Both, } .run(); @@ -2316,6 +2380,8 @@ mod test { // DataType::Struct(Fields), // DataType::Union(UnionFields, UnionMode), // DataType::Dictionary(Box, Box), + // DataType::Decimal32(u8, i8), + // DataType::Decimal64(u8, i8), // DataType::Decimal128(u8, i8), // DataType::Decimal256(u8, i8), // DataType::Map(FieldRef, bool), From 48d5b441bb24020e6868ec6446585ce84de508ca Mon Sep 17 00:00:00 2001 From: Tom Forbes Date: Sun, 29 Dec 2024 14:22:46 +0000 Subject: [PATCH 20/68] Fix error message typos with Parquet compression (#6918) --- parquet/src/basic.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 97e8c22f1b2f..99f122fe4c3e 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -426,14 +426,19 @@ fn split_compression_string(str_setting: &str) -> Result<(&str, Option), Pa fn check_level_is_none(level: &Option) -> Result<(), ParquetError> { if level.is_some() { - return Err(ParquetError::General("level is not support".to_string())); + return Err(ParquetError::General( + "compression level is not supported".to_string(), + )); } Ok(()) } fn require_level(codec: &str, level: Option) -> Result { - level.ok_or(ParquetError::General(format!("{} require level", codec))) + level.ok_or(ParquetError::General(format!( + "{} requires a compression level", + codec + ))) } impl FromStr for Compression { From 3b96eaa1535cd0003c7ba3db68d1cece56f6f58c Mon Sep 17 00:00:00 2001 From: wiedld Date: Mon, 30 Dec 2024 05:57:59 -0500 Subject: [PATCH 21/68] chore: expose arrow-schema methods, for use when writing parquet outside of ArrowWriter (#6916) --- parquet/src/arrow/mod.rs | 4 ++-- parquet/src/arrow/schema/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 6777e00fb05c..1305bbac83f0 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -123,8 +123,8 @@ use arrow_schema::{FieldRef, Schema}; pub use self::schema::arrow_to_parquet_schema; pub use self::schema::{ - parquet_to_arrow_field_levels, parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, - ArrowSchemaConverter, FieldLevels, + add_encoded_arrow_schema_to_metadata, encode_arrow_schema, parquet_to_arrow_field_levels, + parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, ArrowSchemaConverter, FieldLevels, }; /// Schema metadata key used to store serialized Arrow IPC schema diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 689f7a103276..d1fa3eeb84d7 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -170,7 +170,7 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result { } /// Encodes the Arrow schema into the IPC format, and base64 encodes it -fn encode_arrow_schema(schema: &Schema) -> String { +pub fn encode_arrow_schema(schema: &Schema) -> String { let options = writer::IpcWriteOptions::default(); #[allow(deprecated)] let mut dictionary_tracker = @@ -192,7 +192,7 @@ fn encode_arrow_schema(schema: &Schema) -> String { /// Mutates writer metadata by storing the encoded Arrow schema. 
 /// If there is an existing Arrow schema metadata, it is replaced.
-pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterProperties) {
+pub fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterProperties) {
     let encoded = encode_arrow_schema(schema);
 
     let schema_kv = KeyValue {

From bf24a301ba2a25f442a30262753ab5b811af7794 Mon Sep 17 00:00:00 2001
From: Takahiro Ebato
Date: Mon, 30 Dec 2024 19:58:56 +0900
Subject: [PATCH 22/68] Improve error message for unsupported cast between struct and other types (#6919)

---
 arrow-cast/src/cast/mod.rs | 38 ++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 483680b1d39d..0946af53a60f 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -1081,12 +1081,12 @@ pub fn cast_with_options(
             let array = StructArray::try_new(to_fields.clone(), fields, array.nulls().cloned())?;
             Ok(Arc::new(array) as ArrayRef)
         }
-        (Struct(_), _) => Err(ArrowError::CastError(
-            "Cannot cast from struct to other types except struct".to_string(),
-        )),
-        (_, Struct(_)) => Err(ArrowError::CastError(
-            "Cannot cast to struct from other types except struct".to_string(),
-        )),
+        (Struct(_), _) => Err(ArrowError::CastError(format!(
+            "Casting from {from_type:?} to {to_type:?} not supported"
+        ))),
+        (_, Struct(_)) => Err(ArrowError::CastError(format!(
+            "Casting from {from_type:?} to {to_type:?} not supported"
+        ))),
         (_, Boolean) => match from_type {
             UInt8 => cast_numeric_to_bool::<UInt8Type>(array),
             UInt16 => cast_numeric_to_bool::<UInt16Type>(array),
@@ -10288,6 +10288,32 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_cast_struct_to_non_struct() {
+        let boolean = Arc::new(BooleanArray::from(vec![true, false]));
+        let struct_array = StructArray::from(vec![(
+            Arc::new(Field::new("a", DataType::Boolean, false)),
+            boolean.clone() as ArrayRef,
+        )]);
+        let to_type = DataType::Utf8;
+        let result = cast(&struct_array, &to_type);
+        assert_eq!(
+            r#"Cast error: Casting from Struct([Field { name: "a", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]) to Utf8 not supported"#,
+            result.unwrap_err().to_string()
+        );
+    }
+
+    #[test]
+    fn test_cast_non_struct_to_struct() {
+        let array = StringArray::from(vec!["a", "b"]);
+        let to_type = DataType::Struct(vec![Field::new("a", DataType::Boolean, false)].into());
+        let result = cast(&array, &to_type);
+        assert_eq!(
+            r#"Cast error: Casting from Utf8 to Struct([Field { name: "a", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]) not supported"#,
+            result.unwrap_err().to_string()
+        );
+    }
+
     #[test]
     fn test_decimal_to_decimal_throw_error_on_precision_overflow_same_scale() {
         let array = vec![Some(123456789)];

From d58348d1f934ba56e6a426867b95c720176ebce5 Mon Sep 17 00:00:00 2001
From: Tai Le Manh
Date: Mon, 30 Dec 2024 19:42:37 +0700
Subject: [PATCH 23/68] [arrow-string] Implement string view support for `regexp_match` (#6849)

* [arrow-string] Implement string view support for regexp match

Signed-off-by: Tai Le Manh

* update unit tests

* fix clippy warnings

* Add test cases

Signed-off-by: Tai Le Manh

---------

Signed-off-by: Tai Le Manh
---
 arrow-string/src/regexp.rs | 640 ++++++++++++++++++++++++++-----------
 1 file changed, 452 insertions(+), 188 deletions(-)

diff --git a/arrow-string/src/regexp.rs b/arrow-string/src/regexp.rs
index d14662be7280..f3893cd5bd13 100644
--- a/arrow-string/src/regexp.rs
+++
b/arrow-string/src/regexp.rs @@ -20,7 +20,9 @@ use crate::like::StringArrayType; -use arrow_array::builder::{BooleanBufferBuilder, GenericStringBuilder, ListBuilder}; +use arrow_array::builder::{ + BooleanBufferBuilder, GenericStringBuilder, ListBuilder, StringViewBuilder, +}; use arrow_array::cast::AsArray; use arrow_array::*; use arrow_buffer::NullBuffer; @@ -243,78 +245,96 @@ where Ok(BooleanArray::from(data)) } -fn regexp_array_match( - array: &GenericStringArray, - regex_array: &GenericStringArray, - flags_array: Option<&GenericStringArray>, -) -> Result { - let mut patterns: HashMap = HashMap::new(); - let builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); - let mut list_builder = ListBuilder::new(builder); +macro_rules! process_regexp_array_match { + ($array:expr, $regex_array:expr, $flags_array:expr, $list_builder:expr) => { + let mut patterns: HashMap = HashMap::new(); - let complete_pattern = match flags_array { - Some(flags) => Box::new( - regex_array - .iter() - .zip(flags.iter()) - .map(|(pattern, flags)| { + let complete_pattern = match $flags_array { + Some(flags) => Box::new($regex_array.iter().zip(flags.iter()).map( + |(pattern, flags)| { pattern.map(|pattern| match flags { Some(value) => format!("(?{value}){pattern}"), None => pattern.to_string(), }) - }), - ) as Box>>, - None => Box::new( - regex_array - .iter() - .map(|pattern| pattern.map(|pattern| pattern.to_string())), - ), - }; + }, + )) as Box>>, + None => Box::new( + $regex_array + .iter() + .map(|pattern| pattern.map(|pattern| pattern.to_string())), + ), + }; - array - .iter() - .zip(complete_pattern) - .map(|(value, pattern)| { - match (value, pattern) { - // Required for Postgres compatibility: - // SELECT regexp_match('foobarbequebaz', ''); = {""} - (Some(_), Some(pattern)) if pattern == *"" => { - list_builder.values().append_value(""); - list_builder.append(true); - } - (Some(value), Some(pattern)) => { - let existing_pattern = patterns.get(&pattern); - let re = match existing_pattern { - Some(re) => re, - None => { - let re = Regex::new(pattern.as_str()).map_err(|e| { - ArrowError::ComputeError(format!( - "Regular expression did not compile: {e:?}" - )) - })?; - patterns.entry(pattern).or_insert(re) - } - }; - match re.captures(value) { - Some(caps) => { - let mut iter = caps.iter(); - if caps.len() > 1 { - iter.next(); - } - for m in iter.flatten() { - list_builder.values().append_value(m.as_str()); + $array + .iter() + .zip(complete_pattern) + .map(|(value, pattern)| { + match (value, pattern) { + // Required for Postgres compatibility: + // SELECT regexp_match('foobarbequebaz', ''); = {""} + (Some(_), Some(pattern)) if pattern == *"" => { + $list_builder.values().append_value(""); + $list_builder.append(true); + } + (Some(value), Some(pattern)) => { + let existing_pattern = patterns.get(&pattern); + let re = match existing_pattern { + Some(re) => re, + None => { + let re = Regex::new(pattern.as_str()).map_err(|e| { + ArrowError::ComputeError(format!( + "Regular expression did not compile: {e:?}" + )) + })?; + patterns.entry(pattern).or_insert(re) } + }; + match re.captures(value) { + Some(caps) => { + let mut iter = caps.iter(); + if caps.len() > 1 { + iter.next(); + } + for m in iter.flatten() { + $list_builder.values().append_value(m.as_str()); + } - list_builder.append(true); + $list_builder.append(true); + } + None => $list_builder.append(false), } - None => list_builder.append(false), } + _ => $list_builder.append(false), } - _ => list_builder.append(false), - } - Ok(()) - }) 
- .collect::, ArrowError>>()?; + Ok(()) + }) + .collect::, ArrowError>>()?; + }; +} + +fn regexp_array_match( + array: &GenericStringArray, + regex_array: &GenericStringArray, + flags_array: Option<&GenericStringArray>, +) -> Result { + let builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); + let mut list_builder = ListBuilder::new(builder); + + process_regexp_array_match!(array, regex_array, flags_array, list_builder); + + Ok(Arc::new(list_builder.finish())) +} + +fn regexp_array_match_utf8view( + array: &StringViewArray, + regex_array: &StringViewArray, + flags_array: Option<&StringViewArray>, +) -> Result { + let builder = StringViewBuilder::with_capacity(0); + let mut list_builder = ListBuilder::new(builder); + + process_regexp_array_match!(array, regex_array, flags_array, list_builder); + Ok(Arc::new(list_builder.finish())) } @@ -333,6 +353,54 @@ fn get_scalar_pattern_flag<'a, OffsetSize: OffsetSizeTrait>( } } +fn get_scalar_pattern_flag_utf8view<'a>( + regex_array: &'a dyn Array, + flag_array: Option<&'a dyn Array>, +) -> (Option<&'a str>, Option<&'a str>) { + let regex = regex_array.as_string_view(); + let regex = regex.is_valid(0).then(|| regex.value(0)); + + if let Some(flag_array) = flag_array { + let flag = flag_array.as_string_view(); + (regex, flag.is_valid(0).then(|| flag.value(0))) + } else { + (regex, None) + } +} + +macro_rules! process_regexp_match { + ($array:expr, $regex:expr, $list_builder:expr) => { + $array + .iter() + .map(|value| { + match value { + // Required for Postgres compatibility: + // SELECT regexp_match('foobarbequebaz', ''); = {""} + Some(_) if $regex.as_str().is_empty() => { + $list_builder.values().append_value(""); + $list_builder.append(true); + } + Some(value) => match $regex.captures(value) { + Some(caps) => { + let mut iter = caps.iter(); + if caps.len() > 1 { + iter.next(); + } + for m in iter.flatten() { + $list_builder.values().append_value(m.as_str()); + } + $list_builder.append(true); + } + None => $list_builder.append(false), + }, + None => $list_builder.append(false), + } + Ok(()) + }) + .collect::, ArrowError>>()? 
+ }; +} + fn regexp_scalar_match( array: &GenericStringArray, regex: &Regex, @@ -340,35 +408,19 @@ fn regexp_scalar_match( let builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); let mut list_builder = ListBuilder::new(builder); - array - .iter() - .map(|value| { - match value { - // Required for Postgres compatibility: - // SELECT regexp_match('foobarbequebaz', ''); = {""} - Some(_) if regex.as_str() == "" => { - list_builder.values().append_value(""); - list_builder.append(true); - } - Some(value) => match regex.captures(value) { - Some(caps) => { - let mut iter = caps.iter(); - if caps.len() > 1 { - iter.next(); - } - for m in iter.flatten() { - list_builder.values().append_value(m.as_str()); - } + process_regexp_match!(array, regex, list_builder); - list_builder.append(true); - } - None => list_builder.append(false), - }, - _ => list_builder.append(false), - } - Ok(()) - }) - .collect::, ArrowError>>()?; + Ok(Arc::new(list_builder.finish())) +} + +fn regexp_scalar_match_utf8view( + array: &StringViewArray, + regex: &Regex, +) -> Result { + let builder = StringViewBuilder::with_capacity(0); + let mut list_builder = ListBuilder::new(builder); + + process_regexp_match!(array, regex, list_builder); Ok(Arc::new(list_builder.finish())) } @@ -406,7 +458,7 @@ pub fn regexp_match( if array.data_type() != rhs.data_type() { return Err(ArrowError::ComputeError( - "regexp_match() requires both array and pattern to be either Utf8 or LargeUtf8" + "regexp_match() requires both array and pattern to be either Utf8, Utf8View or LargeUtf8" .to_string(), )); } @@ -428,7 +480,7 @@ pub fn regexp_match( if flags_array.is_some() && rhs.data_type() != flags.unwrap().data_type() { return Err(ArrowError::ComputeError( - "regexp_match() requires both pattern and flags to be either string or largestring" + "regexp_match() requires both pattern and flags to be either Utf8, Utf8View or LargeUtf8" .to_string(), )); } @@ -436,11 +488,13 @@ pub fn regexp_match( if is_rhs_scalar { // Regex and flag is scalars let (regex, flag) = match rhs.data_type() { + DataType::Utf8View => get_scalar_pattern_flag_utf8view(rhs, flags), DataType::Utf8 => get_scalar_pattern_flag::(rhs, flags), DataType::LargeUtf8 => get_scalar_pattern_flag::(rhs, flags), _ => { return Err(ArrowError::ComputeError( - "regexp_match() requires pattern to be either Utf8 or LargeUtf8".to_string(), + "regexp_match() requires pattern to be either Utf8, Utf8View or LargeUtf8" + .to_string(), )); } }; @@ -468,14 +522,21 @@ pub fn regexp_match( })?; match array.data_type() { + DataType::Utf8View => regexp_scalar_match_utf8view(array.as_string_view(), &re), DataType::Utf8 => regexp_scalar_match(array.as_string::(), &re), DataType::LargeUtf8 => regexp_scalar_match(array.as_string::(), &re), _ => Err(ArrowError::ComputeError( - "regexp_match() requires array to be either Utf8 or LargeUtf8".to_string(), + "regexp_match() requires array to be either Utf8, Utf8View or LargeUtf8" + .to_string(), )), } } else { match array.data_type() { + DataType::Utf8View => { + let regex_array = rhs.as_string_view(); + let flags_array = flags.map(|flags| flags.as_string_view()); + regexp_array_match_utf8view(array.as_string_view(), regex_array, flags_array) + } DataType::Utf8 => { let regex_array = rhs.as_string(); let flags_array = flags.map(|flags| flags.as_string()); @@ -487,7 +548,8 @@ pub fn regexp_match( regexp_array_match(array.as_string::(), regex_array, flags_array) } _ => Err(ArrowError::ComputeError( - "regexp_match() requires array to be either Utf8 
or LargeUtf8".to_string(), + "regexp_match() requires array to be either Utf8, Utf8View or LargeUtf8" + .to_string(), )), } } @@ -497,114 +559,316 @@ pub fn regexp_match( mod tests { use super::*; - #[test] - fn match_single_group() { - let values = vec![ + macro_rules! test_match_single_group { + ($test_name:ident, $values:expr, $patterns:expr, $arr_type:ty, $builder_type:ty, $expected:expr) => { + #[test] + fn $test_name() { + let array: $arr_type = <$arr_type>::from($values); + let pattern: $arr_type = <$arr_type>::from($patterns); + + let actual = regexp_match(&array, &pattern, None).unwrap(); + + let elem_builder: $builder_type = <$builder_type>::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + + for val in $expected { + match val { + Some(v) => { + expected_builder.values().append_value(v); + expected_builder.append(true); + } + None => expected_builder.append(false), + } + } + + let expected = expected_builder.finish(); + let result = actual.as_any().downcast_ref::().unwrap(); + assert_eq!(&expected, result); + } + }; + } + + test_match_single_group!( + match_single_group_string, + vec![ Some("abc-005-def"), Some("X-7-5"), Some("X545"), None, Some("foobarbequebaz"), Some("foobarbequebaz"), - ]; - let array = StringArray::from(values); - let mut pattern_values = vec![r".*-(\d*)-.*"; 4]; - pattern_values.push(r"(bar)(bequ1e)"); - pattern_values.push(""); - let pattern = GenericStringArray::::from(pattern_values); - let actual = regexp_match(&array, &pattern, None).unwrap(); - let elem_builder: GenericStringBuilder = GenericStringBuilder::new(); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.values().append_value("005"); - expected_builder.append(true); - expected_builder.values().append_value("7"); - expected_builder.append(true); - expected_builder.append(false); - expected_builder.append(false); - expected_builder.append(false); - expected_builder.values().append_value(""); - expected_builder.append(true); - let expected = expected_builder.finish(); - let result = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(&expected, result); - } + ], + vec![ + r".*-(\d*)-.*", + r".*-(\d*)-.*", + r".*-(\d*)-.*", + r".*-(\d*)-.*", + r"(bar)(bequ1e)", + "" + ], + StringArray, + GenericStringBuilder, + [Some("005"), Some("7"), None, None, None, Some("")] + ); + test_match_single_group!( + match_single_group_string_view, + vec![ + Some("abc-005-def"), + Some("X-7-5"), + Some("X545"), + None, + Some("foobarbequebaz"), + Some("foobarbequebaz"), + ], + vec![ + r".*-(\d*)-.*", + r".*-(\d*)-.*", + r".*-(\d*)-.*", + r".*-(\d*)-.*", + r"(bar)(bequ1e)", + "" + ], + StringViewArray, + StringViewBuilder, + [Some("005"), Some("7"), None, None, None, Some("")] + ); + + macro_rules! 
test_match_single_group_with_flags { + ($test_name:ident, $values:expr, $patterns:expr, $flags:expr, $array_type:ty, $builder_type:ty, $expected:expr) => { + #[test] + fn $test_name() { + let array: $array_type = <$array_type>::from($values); + let pattern: $array_type = <$array_type>::from($patterns); + let flags: $array_type = <$array_type>::from($flags); + + let actual = regexp_match(&array, &pattern, Some(&flags)).unwrap(); - #[test] - fn match_single_group_with_flags() { - let values = vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None]; - let array = StringArray::from(values); - let pattern = StringArray::from(vec![r"x.*-(\d*)-.*"; 4]); - let flags = StringArray::from(vec!["i"; 4]); - let actual = regexp_match(&array, &pattern, Some(&flags)).unwrap(); - let elem_builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.append(false); - expected_builder.values().append_value("7"); - expected_builder.append(true); - expected_builder.append(false); - expected_builder.append(false); - let expected = expected_builder.finish(); - let result = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(&expected, result); + let elem_builder: $builder_type = <$builder_type>::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + + for val in $expected { + match val { + Some(v) => { + expected_builder.values().append_value(v); + expected_builder.append(true); + } + None => { + expected_builder.append(false); + } + } + } + + let expected = expected_builder.finish(); + let result = actual.as_any().downcast_ref::().unwrap(); + assert_eq!(&expected, result); + } + }; } - #[test] - fn match_scalar_pattern() { - let values = vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None]; - let array = StringArray::from(values); - let pattern = Scalar::new(StringArray::from(vec![r"x.*-(\d*)-.*"; 1])); - let flags = Scalar::new(StringArray::from(vec!["i"; 1])); - let actual = regexp_match(&array, &pattern, Some(&flags)).unwrap(); - let elem_builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.append(false); - expected_builder.values().append_value("7"); - expected_builder.append(true); - expected_builder.append(false); - expected_builder.append(false); - let expected = expected_builder.finish(); - let result = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(&expected, result); - - // No flag - let values = vec![Some("abc-005-def"), Some("x-7-5"), Some("X545"), None]; - let array = StringArray::from(values); - let actual = regexp_match(&array, &pattern, None).unwrap(); - let result = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(&expected, result); + test_match_single_group_with_flags!( + match_single_group_with_flags_string, + vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None], + vec![r"x.*-(\d*)-.*"; 4], + vec!["i"; 4], + StringArray, + GenericStringBuilder, + [None, Some("7"), None, None] + ); + test_match_single_group_with_flags!( + match_single_group_with_flags_stringview, + vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None], + vec![r"x.*-(\d*)-.*"; 4], + vec!["i"; 4], + StringViewArray, + StringViewBuilder, + [None, Some("7"), None, None] + ); + + macro_rules! 
test_match_scalar_pattern { + ($test_name:ident, $values:expr, $pattern:expr, $flag:expr, $array_type:ty, $builder_type:ty, $expected:expr) => { + #[test] + fn $test_name() { + let array: $array_type = <$array_type>::from($values); + + let pattern_scalar = Scalar::new(<$array_type>::from(vec![$pattern; 1])); + let flag_scalar = Scalar::new(<$array_type>::from(vec![$flag; 1])); + + let actual = regexp_match(&array, &pattern_scalar, Some(&flag_scalar)).unwrap(); + + let elem_builder: $builder_type = <$builder_type>::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + + for val in $expected { + match val { + Some(v) => { + expected_builder.values().append_value(v); + expected_builder.append(true); + } + None => expected_builder.append(false), + } + } + + let expected = expected_builder.finish(); + let result = actual.as_any().downcast_ref::().unwrap(); + assert_eq!(&expected, result); + } + }; } - #[test] - fn match_scalar_no_pattern() { - let values = vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None]; - let array = StringArray::from(values); - let pattern = Scalar::new(new_null_array(&DataType::Utf8, 1)); - let actual = regexp_match(&array, &pattern, None).unwrap(); - let elem_builder: GenericStringBuilder = GenericStringBuilder::with_capacity(0, 0); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.append(false); - expected_builder.append(false); - expected_builder.append(false); - expected_builder.append(false); - let expected = expected_builder.finish(); - let result = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(&expected, result); + test_match_scalar_pattern!( + match_scalar_pattern_string_with_flags, + vec![ + Some("abc-005-def"), + Some("x-7-5"), + Some("X-0-Y"), + Some("X545"), + None + ], + r"x.*-(\d*)-.*", + Some("i"), + StringArray, + GenericStringBuilder, + [None, Some("7"), Some("0"), None, None] + ); + test_match_scalar_pattern!( + match_scalar_pattern_stringview_with_flags, + vec![ + Some("abc-005-def"), + Some("x-7-5"), + Some("X-0-Y"), + Some("X545"), + None + ], + r"x.*-(\d*)-.*", + Some("i"), + StringViewArray, + StringViewBuilder, + [None, Some("7"), Some("0"), None, None] + ); + + test_match_scalar_pattern!( + match_scalar_pattern_string_no_flags, + vec![ + Some("abc-005-def"), + Some("x-7-5"), + Some("X-0-Y"), + Some("X545"), + None + ], + r"x.*-(\d*)-.*", + None::<&str>, + StringArray, + GenericStringBuilder, + [None, Some("7"), None, None, None] + ); + test_match_scalar_pattern!( + match_scalar_pattern_stringview_no_flags, + vec![ + Some("abc-005-def"), + Some("x-7-5"), + Some("X-0-Y"), + Some("X545"), + None + ], + r"x.*-(\d*)-.*", + None::<&str>, + StringViewArray, + StringViewBuilder, + [None, Some("7"), None, None, None] + ); + + macro_rules! 
test_match_scalar_no_pattern { + ($test_name:ident, $values:expr, $array_type:ty, $pattern_type:expr, $builder_type:ty, $expected:expr) => { + #[test] + fn $test_name() { + let array: $array_type = <$array_type>::from($values); + let pattern = Scalar::new(new_null_array(&$pattern_type, 1)); + + let actual = regexp_match(&array, &pattern, None).unwrap(); + + let elem_builder: $builder_type = <$builder_type>::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + + for val in $expected { + match val { + Some(v) => { + expected_builder.values().append_value(v); + expected_builder.append(true); + } + None => expected_builder.append(false), + } + } + + let expected = expected_builder.finish(); + let result = actual.as_any().downcast_ref::().unwrap(); + assert_eq!(&expected, result); + } + }; } - #[test] - fn test_single_group_not_skip_match() { - let array = StringArray::from(vec![Some("foo"), Some("bar")]); - let pattern = GenericStringArray::::from(vec![r"foo"]); - let actual = regexp_match(&array, &pattern, None).unwrap(); - let result = actual.as_any().downcast_ref::().unwrap(); - let elem_builder: GenericStringBuilder = GenericStringBuilder::new(); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.values().append_value("foo"); - expected_builder.append(true); - let expected = expected_builder.finish(); - assert_eq!(&expected, result); + test_match_scalar_no_pattern!( + match_scalar_no_pattern_string, + vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None], + StringArray, + DataType::Utf8, + GenericStringBuilder, + [None::<&str>, None, None, None] + ); + test_match_scalar_no_pattern!( + match_scalar_no_pattern_stringview, + vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None], + StringViewArray, + DataType::Utf8View, + StringViewBuilder, + [None::<&str>, None, None, None] + ); + + macro_rules! test_match_single_group_not_skip { + ($test_name:ident, $values:expr, $pattern:expr, $array_type:ty, $builder_type:ty, $expected:expr) => { + #[test] + fn $test_name() { + let array: $array_type = <$array_type>::from($values); + let pattern: $array_type = <$array_type>::from(vec![$pattern]); + + let actual = regexp_match(&array, &pattern, None).unwrap(); + + let elem_builder: $builder_type = <$builder_type>::new(); + let mut expected_builder = ListBuilder::new(elem_builder); + + for val in $expected { + match val { + Some(v) => { + expected_builder.values().append_value(v); + expected_builder.append(true); + } + None => expected_builder.append(false), + } + } + + let expected = expected_builder.finish(); + let result = actual.as_any().downcast_ref::().unwrap(); + assert_eq!(&expected, result); + } + }; } + test_match_single_group_not_skip!( + match_single_group_not_skip_string, + vec![Some("foo"), Some("bar")], + r"foo", + StringArray, + GenericStringBuilder, + [Some("foo")] + ); + test_match_single_group_not_skip!( + match_single_group_not_skip_stringview, + vec![Some("foo"), Some("bar")], + r"foo", + StringViewArray, + StringViewBuilder, + [Some("foo")] + ); + macro_rules! 
test_flag_utf8 { ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { #[test] From 8880bde3de808e1d159ae77964525f064124fd9d Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 1 Jan 2025 06:25:04 -0800 Subject: [PATCH 24/68] Add doctest example for `Buffer::from_bytes` (#6920) * Add doctest example for * Remove typo * Update arrow-buffer/src/buffer/immutable.rs --------- Co-authored-by: Andrew Lamb --- arrow-buffer/src/buffer/immutable.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index d0c8ffa39783..cf1d6f366751 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -60,6 +60,14 @@ unsafe impl Sync for Buffer where Bytes: Sync {} impl Buffer { /// Auxiliary method to create a new Buffer + /// + /// This can be used with a [`bytes::Bytes`] via `into()`: + /// + /// ``` + /// # use arrow_buffer::Buffer; + /// let bytes = bytes::Bytes::from_static(b"foo"); + /// let buffer = Buffer::from_bytes(bytes.into()); + /// ``` #[inline] pub fn from_bytes(bytes: Bytes) -> Self { let length = bytes.len(); From 7289a998f33d1243a9447968970a1650a217796a Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Thu, 2 Jan 2025 10:09:34 +0100 Subject: [PATCH 25/68] object_store: Add enabled-by-default "fs" feature (#6636) --- .github/workflows/object_store.yml | 4 ++++ object_store/Cargo.toml | 4 +++- object_store/src/chunked.rs | 4 ++++ object_store/src/lib.rs | 17 +++++++++++------ object_store/src/limit.rs | 1 + object_store/src/parse.rs | 4 ++-- object_store/src/throttle.rs | 2 ++ object_store/src/util.rs | 2 +- 8 files changed, 28 insertions(+), 10 deletions(-) diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml index 93f809aaabd4..899318f01324 100644 --- a/.github/workflows/object_store.yml +++ b/.github/workflows/object_store.yml @@ -54,6 +54,10 @@ jobs: # targets. 
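          # One clippy invocation per feature combination below keeps the
          # cfg-gated code (such as the new "fs" feature) honest in both states.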
- name: Run clippy with default features run: cargo clippy -- -D warnings + - name: Run clippy without default features + run: cargo clippy --no-default-features -- -D warnings + - name: Run clippy with fs features + run: cargo clippy --no-default-features --features fs -- -D warnings - name: Run clippy with aws feature run: cargo clippy --features aws -- -D warnings - name: Run clippy with gcp feature diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index bf254b3a0bbd..a127be3602ef 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -41,7 +41,7 @@ percent-encoding = "2.1" snafu = { version = "0.8", default-features = false, features = ["std", "rust_1_61"] } tracing = { version = "0.1" } url = "2.2" -walkdir = "2" +walkdir = { version = "2", optional = true } # Cloud storage support base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } @@ -61,8 +61,10 @@ httparse = { version = "1.8.0", default-features = false, features = ["std"], op nix = { version = "0.29.0", features = ["fs"] } [features] +default = ["fs"] cloud = ["serde", "serde_json", "quick-xml", "hyper", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "base64", "rand", "ring"] azure = ["cloud", "httparse"] +fs = ["walkdir"] gcp = ["cloud", "rustls-pemfile"] aws = ["cloud", "md-5"] http = ["cloud"] diff --git a/object_store/src/chunked.rs b/object_store/src/chunked.rs index 98cc20498013..3f83c1336dc4 100644 --- a/object_store/src/chunked.rs +++ b/object_store/src/chunked.rs @@ -86,6 +86,7 @@ impl ObjectStore for ChunkedStore { async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let r = self.inner.get_opts(location, options).await?; let stream = match r.payload { + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] GetResultPayload::File(file, path) => { crate::local::chunked_stream(file, path, r.range.clone(), self.chunk_size) } @@ -178,7 +179,9 @@ impl ObjectStore for ChunkedStore { mod tests { use futures::StreamExt; + #[cfg(feature = "fs")] use crate::integration::*; + #[cfg(feature = "fs")] use crate::local::LocalFileSystem; use crate::memory::InMemory; use crate::path::Path; @@ -209,6 +212,7 @@ mod tests { } } + #[cfg(feature = "fs")] #[tokio::test] async fn test_chunked() { let temporary = tempfile::tempdir().unwrap(); diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 4d8d8f02a0bc..6f5733226922 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -66,10 +66,13 @@ //! By default, this crate provides the following implementations: //! //! * Memory: [`InMemory`](memory::InMemory) -//! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) //! //! Feature flags are used to enable support for other implementations: //! +#![cfg_attr( + feature = "fs", + doc = "* Local filesystem: [`LocalFileSystem`](local::LocalFileSystem)" +)] #![cfg_attr( feature = "gcp", doc = "* [`gcp`]: [Google Cloud Storage](https://cloud.google.com/storage/) support. 
See [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)" @@ -513,7 +516,7 @@ pub mod gcp; #[cfg(feature = "http")] pub mod http; pub mod limit; -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] pub mod local; pub mod memory; pub mod path; @@ -557,7 +560,7 @@ pub use upload::*; pub use util::{coalesce_ranges, collect_bytes, GetRange, OBJECT_STORE_COALESCE_DEFAULT}; use crate::path::Path; -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] use crate::util::maybe_spawn_blocking; use async_trait::async_trait; use bytes::Bytes; @@ -565,7 +568,7 @@ use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use snafu::Snafu; use std::fmt::{Debug, Formatter}; -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] use std::io::{Read, Seek, SeekFrom}; use std::ops::Range; use std::sync::Arc; @@ -1028,6 +1031,7 @@ pub struct GetResult { /// be able to optimise the case of a file already present on local disk pub enum GetResultPayload { /// The file, path + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] File(std::fs::File, std::path::PathBuf), /// An opaque stream of bytes Stream(BoxStream<'static, Result>), @@ -1036,6 +1040,7 @@ pub enum GetResultPayload { impl Debug for GetResultPayload { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] Self::File(_, _) => write!(f, "GetResultPayload(File)"), Self::Stream(_) => write!(f, "GetResultPayload(Stream)"), } @@ -1047,7 +1052,7 @@ impl GetResult { pub async fn bytes(self) -> Result { let len = self.range.end - self.range.start; match self.payload { - #[cfg(not(target_arch = "wasm32"))] + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] GetResultPayload::File(mut file, path) => { maybe_spawn_blocking(move || { file.seek(SeekFrom::Start(self.range.start as _)) @@ -1087,7 +1092,7 @@ impl GetResult { /// no additional complexity or overheads pub fn into_stream(self) -> BoxStream<'static, Result> { match self.payload { - #[cfg(not(target_arch = "wasm32"))] + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] GetResultPayload::File(file, path) => { const CHUNK_SIZE: usize = 8 * 1024; local::chunked_stream(file, path, self.range, CHUNK_SIZE) diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index 64b96ad1a96c..6a3c3b574e62 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -199,6 +199,7 @@ impl ObjectStore for LimitStore { fn permit_get_result(r: GetResult, permit: OwnedSemaphorePermit) -> GetResult { let payload = match r.payload { + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] v @ GetResultPayload::File(_, _) => v, GetResultPayload::Stream(s) => { GetResultPayload::Stream(PermitWrapper::new(s, permit).boxed()) diff --git a/object_store/src/parse.rs b/object_store/src/parse.rs index debc9e529312..a3919305281d 100644 --- a/object_store/src/parse.rs +++ b/object_store/src/parse.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
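The `#[allow(clippy::infallible_destructuring_match)]` added in the throttle.rs hunk below is needed because, once the `File` variant is gated behind the `fs` feature, a build without that feature leaves the match with a single arm. A self-contained sketch of the situation (enum simplified, not the crate's real `GetResultPayload`):

    // With the `fs` feature disabled this enum has one variant, so clippy
    // would otherwise suggest rewriting the match as a plain `let` binding.
    enum Payload {
        Stream(Vec<u8>),
        #[cfg(feature = "fs")]
        File(std::path::PathBuf),
    }

    fn stream_bytes(payload: Payload) -> Vec<u8> {
        #[allow(clippy::infallible_destructuring_match)]
        let s = match payload {
            Payload::Stream(s) => s,
            #[cfg(feature = "fs")]
            Payload::File(_) => unimplemented!("local files are not throttled"),
        };
        s
    }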
-#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] use crate::local::LocalFileSystem; use crate::memory::InMemory; use crate::path::Path; @@ -179,7 +179,7 @@ where let path = Path::parse(path)?; let store = match scheme { - #[cfg(not(target_arch = "wasm32"))] + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] ObjectStoreScheme::Local => Box::new(LocalFileSystem::new()) as _, ObjectStoreScheme::Memory => Box::new(InMemory::new()) as _, #[cfg(feature = "aws")] diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index d07276c3dcad..b9dff5c6d1d2 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -307,8 +307,10 @@ fn usize_to_u32_saturate(x: usize) -> u32 { } fn throttle_get(result: GetResult, wait_get_per_byte: Duration) -> GetResult { + #[allow(clippy::infallible_destructuring_match)] let s = match result.payload { GetResultPayload::Stream(s) => s, + #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] GetResultPayload::File(_, _) => unimplemented!(), }; diff --git a/object_store/src/util.rs b/object_store/src/util.rs index ecf90f95d7c7..99102a99e61e 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -75,7 +75,7 @@ where } } -#[cfg(not(target_arch = "wasm32"))] +#[cfg(all(feature = "fs", not(target_arch = "wasm32")))] /// Takes a function and spawns it to a tokio blocking pool if available pub(crate) async fn maybe_spawn_blocking(f: F) -> Result where From 95dae6a21a3ead3e9eca5eea35cfc2bcceb3d8dd Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Thu, 2 Jan 2025 23:25:44 +0100 Subject: [PATCH 26/68] object_store: Migrate from `snafu` to `thiserror` (#6266) * object_store: Add `thiserror` dependency * object_store/memory: Migrate from `snafu` to `thiserror` * object_store/parse: Migrate from `snafu` to `thiserror` * object_store/util: Migrate from `snafu` to `thiserror` * object_store/local: Migrate from `snafu` to `thiserror` * object_store/delimited: Migrate from `snafu` to `thiserror` * object_store/path/parts: Migrate from `snafu` to `thiserror` * object_store/path: Migrate from `snafu` to `thiserror` * object_store/http: Migrate from `snafu` to `thiserror` * object_store/client: Migrate from `snafu` to `thiserror` * object_store/aws: Migrate from `snafu` to `thiserror` * object_store/azure: Migrate from `snafu` to `thiserror` * object_store/gcp: Migrate from `snafu` to `thiserror` * object_store/lib: Migrate from `snafu` to `thiserror` * Remove `snafu` dependency --- object_store/Cargo.toml | 2 +- object_store/src/aws/builder.rs | 52 +++++--- object_store/src/aws/client.rs | 87 ++++++------ object_store/src/aws/credential.rs | 17 ++- object_store/src/aws/resolve.rs | 30 ++--- object_store/src/azure/builder.rs | 65 +++++---- object_store/src/azure/client.rs | 93 +++++++------ object_store/src/azure/credential.rs | 41 +++--- object_store/src/client/get.rs | 97 +++++++------- object_store/src/client/header.rs | 54 +++++--- object_store/src/client/retry.rs | 13 +- object_store/src/delimited.rs | 15 ++- object_store/src/gcp/builder.rs | 48 ++++--- object_store/src/gcp/client.rs | 91 +++++++------ object_store/src/gcp/credential.rs | 57 ++++---- object_store/src/http/client.rs | 52 +++++--- object_store/src/http/mod.rs | 13 +- object_store/src/lib.rs | 36 +++-- object_store/src/local.rs | 191 ++++++++++++--------------- object_store/src/memory.rs | 31 ++--- object_store/src/parse.rs | 12 +- object_store/src/path/mod.rs | 35 +++-- object_store/src/path/parts.rs | 7 +- 
object_store/src/util.rs | 9 +- 24 files changed, 620 insertions(+), 528 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index a127be3602ef..6f5e9db1bc70 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -38,7 +38,7 @@ humantime = "2.1" itertools = "0.13.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" -snafu = { version = "0.8", default-features = false, features = ["std", "rust_1_61"] } +thiserror = "2.0.2" tracing = { version = "0.1" } url = "2.2" walkdir = { version = "2", optional = true } diff --git a/object_store/src/aws/builder.rs b/object_store/src/aws/builder.rs index 840245a7b5d4..d29fa782e8ff 100644 --- a/object_store/src/aws/builder.rs +++ b/object_store/src/aws/builder.rs @@ -32,7 +32,6 @@ use itertools::Itertools; use md5::{Digest, Md5}; use reqwest::header::{HeaderMap, HeaderValue}; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -43,46 +42,46 @@ use url::Url; static DEFAULT_METADATA_ENDPOINT: &str = "http://169.254.169.254"; /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Missing bucket name"))] + #[error("Missing bucket name")] MissingBucketName, - #[snafu(display("Missing AccessKeyId"))] + #[error("Missing AccessKeyId")] MissingAccessKeyId, - #[snafu(display("Missing SecretAccessKey"))] + #[error("Missing SecretAccessKey")] MissingSecretAccessKey, - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + #[error("Unable parse source url. Url: {}, Error: {}", url, source)] UnableToParseUrl { source: url::ParseError, url: String, }, - #[snafu(display( + #[error( "Unknown url scheme cannot be parsed into storage location: {}", scheme - ))] + )] UnknownUrlScheme { scheme: String }, - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + #[error("URL did not match any known pattern for scheme: {}", url)] UrlNotRecognised { url: String }, - #[snafu(display("Configuration key: '{}' is not known.", key))] + #[error("Configuration key: '{}' is not known.", key)] UnknownConfigurationKey { key: String }, - #[snafu(display("Invalid Zone suffix for bucket '{bucket}'"))] + #[error("Invalid Zone suffix for bucket '{bucket}'")] ZoneSuffix { bucket: String }, - #[snafu(display("Invalid encryption type: {}. Valid values are \"AES256\", \"sse:kms\", \"sse:kms:dsse\" and \"sse-c\".", passed))] + #[error("Invalid encryption type: {}. Valid values are \"AES256\", \"sse:kms\", \"sse:kms:dsse\" and \"sse-c\".", passed)] InvalidEncryptionType { passed: String }, - #[snafu(display( + #[error( "Invalid encryption header values. 
Header: {}, source: {}", header, source - ))] + )] InvalidEncryptionHeader { header: &'static str, source: Box, @@ -603,8 +602,15 @@ impl AmazonS3Builder { /// This is a separate member function to allow fallible computation to /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + let parsed = Url::parse(url).map_err(|source| { + let url = url.into(); + Error::UnableToParseUrl { url, source } + })?; + + let host = parsed + .host_str() + .ok_or_else(|| Error::UrlNotRecognised { url: url.into() })?; + match parsed.scheme() { "s3" | "s3a" => self.bucket_name = Some(host.to_string()), "https" => match host.splitn(4, '.').collect_tuple() { @@ -630,9 +636,12 @@ impl AmazonS3Builder { self.bucket_name = Some(bucket.into()); } } - _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), + _ => return Err(Error::UrlNotRecognised { url: url.into() }.into()), }, - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + scheme => { + let scheme = scheme.into(); + return Err(Error::UnknownUrlScheme { scheme }.into()); + } }; Ok(()) } @@ -875,7 +884,7 @@ impl AmazonS3Builder { self.parse_url(&url)?; } - let bucket = self.bucket_name.context(MissingBucketNameSnafu)?; + let bucket = self.bucket_name.ok_or(Error::MissingBucketName)?; let region = self.region.unwrap_or_else(|| "us-east-1".to_string()); let checksum = self.checksum_algorithm.map(|x| x.get()).transpose()?; let copy_if_not_exists = self.copy_if_not_exists.map(|x| x.get()).transpose()?; @@ -957,7 +966,10 @@ impl AmazonS3Builder { let (session_provider, zonal_endpoint) = match self.s3_express.get()? 
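        // S3 Express One Zone bucket names carry their availability zone as a
        // suffix; parse_bucket_az below recovers it to build the zonal endpoint.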
{ true => { - let zone = parse_bucket_az(&bucket).context(ZoneSuffixSnafu { bucket: &bucket })?; + let zone = parse_bucket_az(&bucket).ok_or_else(|| { + let bucket = bucket.clone(); + Error::ZoneSuffix { bucket } + })?; // https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-express-Regions-and-Zones.html let endpoint = format!("https://{bucket}.s3express-{zone}.{region}.amazonaws.com"); diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 81015e82b39c..25fdd3311c95 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -56,7 +56,6 @@ use reqwest::{Client as ReqwestClient, Method, RequestBuilder, Response}; use ring::digest; use ring::digest::Context; use serde::{Deserialize, Serialize}; -use snafu::{ResultExt, Snafu}; use std::sync::Arc; const VERSION_HEADER: &str = "x-amz-version-id"; @@ -65,56 +64,56 @@ const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-amz-meta-"; const ALGORITHM: &str = "x-amz-checksum-algorithm"; /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[snafu(display("Error performing DeleteObjects request: {}", source))] + #[error("Error performing DeleteObjects request: {}", source)] DeleteObjectsRequest { source: crate::client::retry::Error }, - #[snafu(display( + #[error( "DeleteObjects request failed for key {}: {} (code: {})", path, message, code - ))] + )] DeleteFailed { path: String, code: String, message: String, }, - #[snafu(display("Error getting DeleteObjects response body: {}", source))] + #[error("Error getting DeleteObjects response body: {}", source)] DeleteObjectsResponse { source: reqwest::Error }, - #[snafu(display("Got invalid DeleteObjects response: {}", source))] + #[error("Got invalid DeleteObjects response: {}", source)] InvalidDeleteObjectsResponse { source: Box, }, - #[snafu(display("Error performing list request: {}", source))] + #[error("Error performing list request: {}", source)] ListRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting list response body: {}", source))] + #[error("Error getting list response body: {}", source)] ListResponseBody { source: reqwest::Error }, - #[snafu(display("Error getting create multipart response body: {}", source))] + #[error("Error getting create multipart response body: {}", source)] CreateMultipartResponseBody { source: reqwest::Error }, - #[snafu(display("Error performing complete multipart request: {}: {}", path, source))] + #[error("Error performing complete multipart request: {}: {}", path, source)] CompleteMultipartRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error getting complete multipart response body: {}", source))] + #[error("Error getting complete multipart response body: {}", source)] CompleteMultipartResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid list response: {}", source))] + #[error("Got invalid list response: {}", source)] InvalidListResponse { source: quick_xml::de::DeError }, - #[snafu(display("Got invalid multipart response: {}", source))] + #[error("Got invalid multipart response: {}", source)] InvalidMultipartResponse { source: quick_xml::de::DeError }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] + #[error("Unable to extract metadata from headers: {}", source)] Metadata { source: crate::client::header::Error, }, @@ -263,10 +262,15 @@ impl SessionCredential<'_> { } } -#[derive(Debug, Snafu)] 
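// [Editor's sketch — not part of the patch] Every hunk in this series applies
// the same mechanical snafu -> thiserror recipe, illustrated below on a
// hypothetical `DemoError`/`parse_demo` pair (neither exists in object_store):
// the `#[snafu(display(..))]` attribute becomes `#[error(..)]` with the same
// format string, and the derived context selector (`ParseUrlSnafu`) plus
// `.context(..)` at the call site become an explicit variant constructed in
// `.map_err(..)`, with the underlying error captured in a field named
// `source` (which thiserror recognizes as `Error::source` automatically).
use thiserror::Error;

#[derive(Debug, Error)]
enum DemoError {
    // snafu form:
    //   #[snafu(display("Unable to parse source url. Url: {}, Error: {}", url, source))]
    #[error("Unable to parse source url. Url: {}, Error: {}", url, source)]
    ParseUrl {
        source: url::ParseError,
        url: String,
    },
}

fn parse_demo(url: &str) -> Result<url::Url, DemoError> {
    // snafu form:
    //   Url::parse(url).context(ParseUrlSnafu { url })
    url::Url::parse(url).map_err(|source| DemoError::ParseUrl {
        url: url.into(),
        source,
    })
}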
+#[derive(Debug, thiserror::Error)] pub enum RequestError { - #[snafu(context(false))] - Generic { source: crate::Error }, + #[error(transparent)] + Generic { + #[from] + source: crate::Error, + }, + + #[error("Retry")] Retry { source: crate::client::retry::Error, path: String, @@ -426,12 +430,16 @@ impl<'a> Request<'a> { .payload(self.payload) .send() .await - .context(RetrySnafu { path }) + .map_err(|source| { + let path = path.into(); + RequestError::Retry { source, path } + }) } pub(crate) async fn do_put(self) -> Result { let response = self.send().await?; - Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + Ok(get_put_result(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?) } } @@ -535,10 +543,10 @@ impl S3Client { .with_aws_sigv4(credential.authorizer(), Some(digest.as_ref())) .send_retry(&self.config.retry_config) .await - .context(DeleteObjectsRequestSnafu {})? + .map_err(|source| Error::DeleteObjectsRequest { source })? .bytes() .await - .context(DeleteObjectsResponseSnafu {})?; + .map_err(|source| Error::DeleteObjectsResponse { source })?; let response: BatchDeleteResponse = quick_xml::de::from_reader(response.reader()).map_err(|err| { @@ -635,10 +643,10 @@ impl S3Client { .await? .bytes() .await - .context(CreateMultipartResponseBodySnafu)?; + .map_err(|source| Error::CreateMultipartResponseBody { source })?; - let response: InitiateMultipartUploadResult = - quick_xml::de::from_reader(response.reader()).context(InvalidMultipartResponseSnafu)?; + let response: InitiateMultipartUploadResult = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidMultipartResponse { source })?; Ok(response.upload_id) } @@ -683,14 +691,14 @@ impl S3Client { .map(|v| v.to_string()); let e_tag = match is_copy { - false => get_etag(response.headers()).context(MetadataSnafu)?, + false => get_etag(response.headers()).map_err(|source| Error::Metadata { source })?, true => { let response = response .bytes() .await - .context(CreateMultipartResponseBodySnafu)?; + .map_err(|source| Error::CreateMultipartResponseBody { source })?; let response: CopyPartResult = quick_xml::de::from_reader(response.reader()) - .context(InvalidMultipartResponseSnafu)?; + .map_err(|source| Error::InvalidMultipartResponse { source })?; response.e_tag } }; @@ -764,19 +772,21 @@ impl S3Client { .retry_error_body(true) .send() .await - .context(CompleteMultipartRequestSnafu { - path: location.as_ref(), + .map_err(|source| Error::CompleteMultipartRequest { + source, + path: location.as_ref().to_string(), })?; - let version = get_version(response.headers(), VERSION_HEADER).context(MetadataSnafu)?; + let version = get_version(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?; let data = response .bytes() .await - .context(CompleteMultipartResponseBodySnafu)?; + .map_err(|source| Error::CompleteMultipartResponseBody { source })?; - let response: CompleteMultipartUploadResult = - quick_xml::de::from_reader(data.reader()).context(InvalidMultipartResponseSnafu)?; + let response: CompleteMultipartUploadResult = quick_xml::de::from_reader(data.reader()) + .map_err(|source| Error::InvalidMultipartResponse { source })?; Ok(PutResult { e_tag: Some(response.e_tag), @@ -884,13 +894,14 @@ impl ListClient for S3Client { .with_aws_sigv4(credential.authorizer(), None) .send_retry(&self.config.retry_config) .await - .context(ListRequestSnafu)? + .map_err(|source| Error::ListRequest { source })? 
.bytes() .await - .context(ListResponseBodySnafu)?; + .map_err(|source| Error::ListResponseBody { source })?; + + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidListResponse { source })?; - let mut response: ListResponse = - quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; let token = response.next_continuation_token.take(); Ok((response.try_into()?, token)) diff --git a/object_store/src/aws/credential.rs b/object_store/src/aws/credential.rs index ee2f8e2ec953..9c74e1c6526a 100644 --- a/object_store/src/aws/credential.rs +++ b/object_store/src/aws/credential.rs @@ -29,23 +29,22 @@ use percent_encoding::utf8_percent_encode; use reqwest::header::{HeaderMap, HeaderValue, AUTHORIZATION}; use reqwest::{Client, Method, Request, RequestBuilder, StatusCode}; use serde::Deserialize; -use snafu::{ResultExt, Snafu}; use std::collections::BTreeMap; use std::sync::Arc; use std::time::{Duration, Instant}; use tracing::warn; use url::Url; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] #[allow(clippy::enum_variant_names)] enum Error { - #[snafu(display("Error performing CreateSession request: {source}"))] + #[error("Error performing CreateSession request: {source}")] CreateSessionRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting CreateSession response: {source}"))] + #[error("Error getting CreateSession response: {source}")] CreateSessionResponse { source: reqwest::Error }, - #[snafu(display("Invalid CreateSessionOutput response: {source}"))] + #[error("Invalid CreateSessionOutput response: {source}")] CreateSessionOutput { source: quick_xml::DeError }, } @@ -726,13 +725,13 @@ impl TokenProvider for SessionProvider { .with_aws_sigv4(Some(authorizer), None) .send_retry(retry) .await - .context(CreateSessionRequestSnafu)? + .map_err(|source| Error::CreateSessionRequest { source })? 
.bytes() .await - .context(CreateSessionResponseSnafu)?; + .map_err(|source| Error::CreateSessionResponse { source })?; - let resp: CreateSessionOutput = - quick_xml::de::from_reader(bytes.reader()).context(CreateSessionOutputSnafu)?; + let resp: CreateSessionOutput = quick_xml::de::from_reader(bytes.reader()) + .map_err(|source| Error::CreateSessionOutput { source })?; let creds = resp.credentials; Ok(TemporaryToken { diff --git a/object_store/src/aws/resolve.rs b/object_store/src/aws/resolve.rs index 25bc74f32f29..db899ea989e3 100644 --- a/object_store/src/aws/resolve.rs +++ b/object_store/src/aws/resolve.rs @@ -17,21 +17,20 @@ use crate::aws::STORE; use crate::{ClientOptions, Result}; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Bucket '{}' not found", bucket))] + #[error("Bucket '{}' not found", bucket)] BucketNotFound { bucket: String }, - #[snafu(display("Failed to resolve region for bucket '{}'", bucket))] + #[error("Failed to resolve region for bucket '{}'", bucket)] ResolveRegion { bucket: String, source: reqwest::Error, }, - #[snafu(display("Failed to parse the region for bucket '{}'", bucket))] + #[error("Failed to parse the region for bucket '{}'", bucket)] RegionParse { bucket: String }, } @@ -54,22 +53,23 @@ pub async fn resolve_bucket_region(bucket: &str, client_options: &ClientOptions) let client = client_options.client()?; - let response = client - .head(&endpoint) - .send() - .await - .context(ResolveRegionSnafu { bucket })?; + let response = client.head(&endpoint).send().await.map_err(|source| { + let bucket = bucket.into(); + Error::ResolveRegion { bucket, source } + })?; - ensure!( - response.status() != StatusCode::NOT_FOUND, - BucketNotFoundSnafu { bucket } - ); + if response.status() == StatusCode::NOT_FOUND { + let bucket = bucket.into(); + return Err(Error::BucketNotFound { bucket }.into()); + } let region = response .headers() .get("x-amz-bucket-region") .and_then(|x| x.to_str().ok()) - .context(RegionParseSnafu { bucket })?; + .ok_or_else(|| Error::RegionParse { + bucket: bucket.into(), + })?; Ok(region.to_string()) } diff --git a/object_store/src/azure/builder.rs b/object_store/src/azure/builder.rs index 08c9a232393d..f0572ebe6358 100644 --- a/object_store/src/azure/builder.rs +++ b/object_store/src/azure/builder.rs @@ -26,7 +26,6 @@ use crate::config::ConfigValue; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use percent_encoding::percent_decode_str; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; use std::sync::Arc; use url::Url; @@ -45,48 +44,48 @@ const EMULATOR_ACCOUNT_KEY: &str = const MSI_ENDPOINT_ENV_KEY: &str = "IDENTITY_ENDPOINT"; /// A specialized `Error` for Azure builder-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + #[error("Unable parse source url. 
Url: {}, Error: {}", url, source)] UnableToParseUrl { source: url::ParseError, url: String, }, - #[snafu(display( + #[error( "Unable parse emulator url {}={}, Error: {}", env_name, env_value, source - ))] + )] UnableToParseEmulatorUrl { env_name: String, env_value: String, source: url::ParseError, }, - #[snafu(display("Account must be specified"))] + #[error("Account must be specified")] MissingAccount {}, - #[snafu(display("Container name must be specified"))] + #[error("Container name must be specified")] MissingContainerName {}, - #[snafu(display( + #[error( "Unknown url scheme cannot be parsed into storage location: {}", scheme - ))] + )] UnknownUrlScheme { scheme: String }, - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + #[error("URL did not match any known pattern for scheme: {}", url)] UrlNotRecognised { url: String }, - #[snafu(display("Failed parsing an SAS key"))] + #[error("Failed parsing an SAS key")] DecodeSasKey { source: std::str::Utf8Error }, - #[snafu(display("Missing component in SAS query pair"))] + #[error("Missing component in SAS query pair")] MissingSasComponent {}, - #[snafu(display("Configuration key: '{}' is not known.", key))] + #[error("Configuration key: '{}' is not known.", key)] UnknownConfigurationKey { key: String }, } @@ -642,11 +641,17 @@ impl MicrosoftAzureBuilder { /// This is a separate member function to allow fallible computation to /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + let parsed = Url::parse(url).map_err(|source| { + let url = url.into(); + Error::UnableToParseUrl { url, source } + })?; + + let host = parsed + .host_str() + .ok_or_else(|| Error::UrlNotRecognised { url: url.into() })?; let validate = |s: &str| match s.contains('.') { - true => Err(UrlNotRecognisedSnafu { url }.build()), + true => Err(Error::UrlNotRecognised { url: url.into() }), false => Ok(s.to_string()), }; @@ -665,7 +670,7 @@ impl MicrosoftAzureBuilder { self.account_name = Some(validate(a)?); self.use_fabric_endpoint = true.into(); } else { - return Err(UrlNotRecognisedSnafu { url }.build().into()); + return Err(Error::UrlNotRecognised { url: url.into() }.into()); } } "https" => match host.split_once('.') { @@ -689,9 +694,12 @@ impl MicrosoftAzureBuilder { } self.use_fabric_endpoint = true.into(); } - _ => return Err(UrlNotRecognisedSnafu { url }.build().into()), + _ => return Err(Error::UrlNotRecognised { url: url.into() }.into()), }, - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + scheme => { + let scheme = scheme.into(); + return Err(Error::UnknownUrlScheme { scheme }.into()); + } } Ok(()) } @@ -924,8 +932,10 @@ impl MicrosoftAzureBuilder { }, }; - let url = - Url::parse(&account_url).context(UnableToParseUrlSnafu { url: account_url })?; + let url = Url::parse(&account_url).map_err(|source| { + let url = account_url.clone(); + Error::UnableToParseUrl { url, source } + })?; let credential = if let Some(credential) = self.credentials { credential @@ -1030,10 +1040,13 @@ impl MicrosoftAzureBuilder { /// if present, otherwise falls back to default_url fn url_from_env(env_name: &str, default_url: &str) -> Result { let url = match std::env::var(env_name) { - Ok(env_value) => Url::parse(&env_value).context(UnableToParseEmulatorUrlSnafu { - env_name, - env_value, - })?, + Ok(env_value) => { + 
Url::parse(&env_value).map_err(|source| Error::UnableToParseEmulatorUrl { + env_name: env_name.into(), + env_value, + source, + })? + } Err(_) => Url::parse(default_url).expect("Failed to parse default URL"), }; Ok(url) @@ -1042,7 +1055,7 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result { fn split_sas(sas: &str) -> Result, Error> { let sas = percent_decode_str(sas) .decode_utf8() - .context(DecodeSasKeySnafu {})?; + .map_err(|source| Error::DecodeSasKey { source })?; let kv_str_pairs = sas .trim_start_matches('?') .split('&') diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 69ff39526bef..ea3a5faf3ad8 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -42,7 +42,6 @@ use reqwest::{ Client as ReqwestClient, Method, RequestBuilder, Response, }; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; @@ -60,84 +59,84 @@ static MS_CONTENT_LANGUAGE: HeaderName = HeaderName::from_static("x-ms-blob-cont static TAGS_HEADER: HeaderName = HeaderName::from_static("x-ms-tags"); /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[snafu(display("Error performing get request {}: {}", path, source))] + #[error("Error performing get request {}: {}", path, source)] GetRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing put request {}: {}", path, source))] + #[error("Error performing put request {}: {}", path, source)] PutRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing delete request {}: {}", path, source))] + #[error("Error performing delete request {}: {}", path, source)] DeleteRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing bulk delete request: {}", source))] + #[error("Error performing bulk delete request: {}", source)] BulkDeleteRequest { source: crate::client::retry::Error }, - #[snafu(display("Error receiving bulk delete request body: {}", source))] + #[error("Error receiving bulk delete request body: {}", source)] BulkDeleteRequestBody { source: reqwest::Error }, - #[snafu(display( + #[error( "Bulk delete request failed due to invalid input: {} (code: {})", reason, code - ))] + )] BulkDeleteRequestInvalidInput { code: String, reason: String }, - #[snafu(display("Got invalid bulk delete response: {}", reason))] + #[error("Got invalid bulk delete response: {}", reason)] InvalidBulkDeleteResponse { reason: String }, - #[snafu(display( + #[error( "Bulk delete request failed for key {}: {} (code: {})", path, reason, code - ))] + )] DeleteFailed { path: String, code: String, reason: String, }, - #[snafu(display("Error performing list request: {}", source))] + #[error("Error performing list request: {}", source)] ListRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting list response body: {}", source))] + #[error("Error getting list response body: {}", source)] ListResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid list response: {}", source))] + #[error("Got invalid list response: {}", source)] InvalidListResponse { source: quick_xml::de::DeError }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] + #[error("Unable to extract metadata from headers: {}", source)] Metadata { source: 
crate::client::header::Error, }, - #[snafu(display("ETag required for conditional update"))] + #[error("ETag required for conditional update")] MissingETag, - #[snafu(display("Error requesting user delegation key: {}", source))] + #[error("Error requesting user delegation key: {}", source)] DelegationKeyRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting user delegation key response body: {}", source))] + #[error("Error getting user delegation key response body: {}", source)] DelegationKeyResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid user delegation key response: {}", source))] + #[error("Got invalid user delegation key response: {}", source)] DelegationKeyResponse { source: quick_xml::de::DeError }, - #[snafu(display("Generating SAS keys with SAS tokens auth is not supported"))] + #[error("Generating SAS keys with SAS tokens auth is not supported")] SASforSASNotSupported, - #[snafu(display("Generating SAS keys while skipping signatures is not supported"))] + #[error("Generating SAS keys while skipping signatures is not supported")] SASwithSkipSignature, } @@ -268,8 +267,9 @@ impl<'a> PutRequest<'a> { .payload(Some(self.payload)) .send() .await - .context(PutRequestSnafu { - path: self.path.as_ref(), + .map_err(|source| { + let path = self.path.as_ref().into(); + Error::PutRequest { path, source } })?; Ok(response) @@ -544,13 +544,14 @@ impl AzureClient { PutMode::Overwrite => builder.idempotent(true), PutMode::Create => builder.header(&IF_NONE_MATCH, "*"), PutMode::Update(v) => { - let etag = v.e_tag.as_ref().context(MissingETagSnafu)?; + let etag = v.e_tag.as_ref().ok_or(Error::MissingETag)?; builder.header(&IF_MATCH, etag) } }; let response = builder.header(&BLOB_TYPE, "BlockBlob").send().await?; - Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + Ok(get_put_result(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?) } /// PUT a block @@ -595,7 +596,8 @@ impl AzureClient { .send() .await?; - Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + Ok(get_put_result(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?) } /// Make an Azure Delete request @@ -620,8 +622,9 @@ impl AzureClient { .sensitive(sensitive) .send() .await - .context(DeleteRequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::DeleteRequest { source, path } })?; Ok(()) @@ -693,14 +696,14 @@ impl AzureClient { .with_azure_authorization(&credential, &self.config.account) .send_retry(&self.config.retry_config) .await - .context(BulkDeleteRequestSnafu {})?; + .map_err(|source| Error::BulkDeleteRequest { source })?; let boundary = parse_multipart_response_boundary(&batch_response)?; let batch_body = batch_response .bytes() .await - .context(BulkDeleteRequestBodySnafu {})?; + .map_err(|source| Error::BulkDeleteRequestBody { source })?; let results = parse_blob_batch_delete_body(batch_body, boundary, &paths).await?; @@ -780,13 +783,13 @@ impl AzureClient { .idempotent(true) .send() .await - .context(DelegationKeyRequestSnafu)? + .map_err(|source| Error::DelegationKeyRequest { source })? 
.bytes() .await - .context(DelegationKeyResponseBodySnafu)?; + .map_err(|source| Error::DelegationKeyResponseBody { source })?; - let response: UserDelegationKey = - quick_xml::de::from_reader(response.reader()).context(DelegationKeyResponseSnafu)?; + let response: UserDelegationKey = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::DelegationKeyResponse { source })?; Ok(response) } @@ -842,9 +845,11 @@ impl AzureClient { .sensitive(sensitive) .send() .await - .context(GetRequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::GetRequest { source, path } })?; + Ok(response) } } @@ -900,8 +905,9 @@ impl GetClient for AzureClient { .sensitive(sensitive) .send() .await - .context(GetRequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::GetRequest { source, path } })?; match response.headers().get("x-ms-resource-type") { @@ -962,13 +968,14 @@ impl ListClient for AzureClient { .sensitive(sensitive) .send() .await - .context(ListRequestSnafu)? + .map_err(|source| Error::ListRequest { source })? .bytes() .await - .context(ListResponseBodySnafu)?; + .map_err(|source| Error::ListResponseBody { source })?; + + let mut response: ListResultInternal = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidListResponse { source })?; - let mut response: ListResultInternal = - quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; let token = response.next_marker.take(); Ok((to_list_result(response, prefix)?, token)) diff --git a/object_store/src/azure/credential.rs b/object_store/src/azure/credential.rs index 2832eed72256..c9e6ac640b4a 100644 --- a/object_store/src/azure/credential.rs +++ b/object_store/src/azure/credential.rs @@ -32,7 +32,6 @@ use reqwest::header::{ }; use reqwest::{Client, Method, Request, RequestBuilder}; use serde::Deserialize; -use snafu::{ResultExt, Snafu}; use std::borrow::Cow; use std::collections::HashMap; use std::fmt::Debug; @@ -71,27 +70,27 @@ const AZURE_STORAGE_SCOPE: &str = "https://storage.azure.com/.default"; /// const AZURE_STORAGE_RESOURCE: &str = "https://storage.azure.com"; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[snafu(display("Error performing token request: {}", source))] + #[error("Error performing token request: {}", source)] TokenRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting token response body: {}", source))] + #[error("Error getting token response body: {}", source)] TokenResponseBody { source: reqwest::Error }, - #[snafu(display("Error reading federated token file "))] + #[error("Error reading federated token file ")] FederatedTokenFile, - #[snafu(display("Invalid Access Key: {}", source))] + #[error("Invalid Access Key: {}", source)] InvalidAccessKey { source: base64::DecodeError }, - #[snafu(display("'az account get-access-token' command failed: {message}"))] + #[error("'az account get-access-token' command failed: {message}")] AzureCli { message: String }, - #[snafu(display("Failed to parse azure cli response: {source}"))] + #[error("Failed to parse azure cli response: {source}")] AzureCliResponse { source: serde_json::Error }, - #[snafu(display("Generating SAS keys with SAS tokens auth is not supported"))] + #[error("Generating SAS keys with SAS tokens auth is not supported")] SASforSASNotSupported, } @@ -113,7 +112,10 @@ pub struct AzureAccessKey(Vec); impl AzureAccessKey { /// Create a new 
[`AzureAccessKey`], checking it for validity pub fn try_new(key: &str) -> Result { - let key = BASE64_STANDARD.decode(key).context(InvalidAccessKeySnafu)?; + let key = BASE64_STANDARD + .decode(key) + .map_err(|source| Error::InvalidAccessKey { source })?; + Ok(Self(key)) } } @@ -636,10 +638,10 @@ impl TokenProvider for ClientSecretOAuthProvider { .idempotent(true) .send() .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(TemporaryToken { token: Arc::new(AzureCredential::BearerToken(response.access_token)), @@ -744,10 +746,10 @@ impl TokenProvider for ImdsManagedIdentityProvider { let response: ImdsTokenResponse = builder .send_retry(retry) .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(TemporaryToken { token: Arc::new(AzureCredential::BearerToken(response.access_token)), @@ -820,10 +822,10 @@ impl TokenProvider for WorkloadIdentityOAuthProvider { .idempotent(true) .send() .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(TemporaryToken { token: Arc::new(AzureCredential::BearerToken(response.access_token)), @@ -900,7 +902,8 @@ impl AzureCliCredential { })?; let token_response = serde_json::from_str::(output) - .context(AzureCliResponseSnafu)?; + .map_err(|source| Error::AzureCliResponse { source })?; + if !token_response.token_type.eq_ignore_ascii_case("bearer") { return Err(Error::AzureCli { message: format!( @@ -1033,10 +1036,10 @@ impl TokenProvider for FabricTokenOAuthProvider { .idempotent(true) .send() .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? 
.text() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; let exp_in = Self::validate_and_get_expiry(&access_token) .map_or(3600, |expiry| expiry - Self::get_current_timestamp()); Ok(TemporaryToken { diff --git a/object_store/src/client/get.rs b/object_store/src/client/get.rs index 5dd62cbece5a..57aca8956452 100644 --- a/object_store/src/client/get.rs +++ b/object_store/src/client/get.rs @@ -29,7 +29,6 @@ use hyper::header::{ use hyper::StatusCode; use reqwest::header::ToStrError; use reqwest::Response; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; /// A client that can perform a get request #[async_trait] @@ -95,49 +94,51 @@ impl ContentRange { } /// A specialized `Error` for get-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum GetResultError { - #[snafu(context(false))] + #[error(transparent)] Header { + #[from] source: crate::client::header::Error, }, - #[snafu(transparent)] + #[error(transparent)] InvalidRangeRequest { + #[from] source: crate::util::InvalidGetRange, }, - #[snafu(display("Received non-partial response when range requested"))] + #[error("Received non-partial response when range requested")] NotPartial, - #[snafu(display("Content-Range header not present in partial response"))] + #[error("Content-Range header not present in partial response")] NoContentRange, - #[snafu(display("Failed to parse value for CONTENT_RANGE header: \"{value}\""))] + #[error("Failed to parse value for CONTENT_RANGE header: \"{value}\"")] ParseContentRange { value: String }, - #[snafu(display("Content-Range header contained non UTF-8 characters"))] + #[error("Content-Range header contained non UTF-8 characters")] InvalidContentRange { source: ToStrError }, - #[snafu(display("Cache-Control header contained non UTF-8 characters"))] + #[error("Cache-Control header contained non UTF-8 characters")] InvalidCacheControl { source: ToStrError }, - #[snafu(display("Content-Disposition header contained non UTF-8 characters"))] + #[error("Content-Disposition header contained non UTF-8 characters")] InvalidContentDisposition { source: ToStrError }, - #[snafu(display("Content-Encoding header contained non UTF-8 characters"))] + #[error("Content-Encoding header contained non UTF-8 characters")] InvalidContentEncoding { source: ToStrError }, - #[snafu(display("Content-Language header contained non UTF-8 characters"))] + #[error("Content-Language header contained non UTF-8 characters")] InvalidContentLanguage { source: ToStrError }, - #[snafu(display("Content-Type header contained non UTF-8 characters"))] + #[error("Content-Type header contained non UTF-8 characters")] InvalidContentType { source: ToStrError }, - #[snafu(display("Metadata value for \"{key:?}\" contained non UTF-8 characters"))] + #[error("Metadata value for \"{key:?}\" contained non UTF-8 characters")] InvalidMetadata { key: String }, - #[snafu(display("Requested {expected:?}, got {actual:?}"))] + #[error("Requested {expected:?}, got {actual:?}")] UnexpectedRange { expected: Range, actual: Range, @@ -153,17 +154,24 @@ fn get_result( // ensure that we receive the range we asked for let range = if let Some(expected) = range { - ensure!( - response.status() == StatusCode::PARTIAL_CONTENT, - NotPartialSnafu - ); + if response.status() != StatusCode::PARTIAL_CONTENT { + return Err(GetResultError::NotPartial); + } + let val = response .headers() .get(CONTENT_RANGE) - .context(NoContentRangeSnafu)?; + .ok_or(GetResultError::NoContentRange)?; + + let value = val + 
.to_str() + .map_err(|source| GetResultError::InvalidContentRange { source })?; + + let value = ContentRange::from_str(value).ok_or_else(|| { + let value = value.into(); + GetResultError::ParseContentRange { value } + })?; - let value = val.to_str().context(InvalidContentRangeSnafu)?; - let value = ContentRange::from_str(value).context(ParseContentRangeSnafu { value })?; let actual = value.range; // Update size to reflect full size of object (#5272) @@ -171,10 +179,9 @@ fn get_result( let expected = expected.as_range(meta.size)?; - ensure!( - actual == expected, - UnexpectedRangeSnafu { expected, actual } - ); + if actual != expected { + return Err(GetResultError::UnexpectedRange { expected, actual }); + } actual } else { @@ -182,11 +189,11 @@ fn get_result( }; macro_rules! parse_attributes { - ($headers:expr, $(($header:expr, $attr:expr, $err:expr)),*) => {{ + ($headers:expr, $(($header:expr, $attr:expr, $map_err:expr)),*) => {{ let mut attributes = Attributes::new(); $( if let Some(x) = $headers.get($header) { - let x = x.to_str().context($err)?; + let x = x.to_str().map_err($map_err)?; attributes.insert($attr, x.to_string().into()); } )* @@ -196,31 +203,23 @@ fn get_result( let mut attributes = parse_attributes!( response.headers(), - ( - CACHE_CONTROL, - Attribute::CacheControl, - InvalidCacheControlSnafu - ), + (CACHE_CONTROL, Attribute::CacheControl, |source| { + GetResultError::InvalidCacheControl { source } + }), ( CONTENT_DISPOSITION, Attribute::ContentDisposition, - InvalidContentDispositionSnafu - ), - ( - CONTENT_ENCODING, - Attribute::ContentEncoding, - InvalidContentEncodingSnafu + |source| GetResultError::InvalidContentDisposition { source } ), - ( - CONTENT_LANGUAGE, - Attribute::ContentLanguage, - InvalidContentLanguageSnafu - ), - ( - CONTENT_TYPE, - Attribute::ContentType, - InvalidContentTypeSnafu - ) + (CONTENT_ENCODING, Attribute::ContentEncoding, |source| { + GetResultError::InvalidContentEncoding { source } + }), + (CONTENT_LANGUAGE, Attribute::ContentLanguage, |source| { + GetResultError::InvalidContentLanguage { source } + }), + (CONTENT_TYPE, Attribute::ContentType, |source| { + GetResultError::InvalidContentType { source } + }) ); // Add attributes that match the user-defined metadata prefix (e.g. 
x-amz-meta-) diff --git a/object_store/src/client/header.rs b/object_store/src/client/header.rs index 07c04c11945a..db06da6345d5 100644 --- a/object_store/src/client/header.rs +++ b/object_store/src/client/header.rs @@ -22,7 +22,6 @@ use crate::ObjectMeta; use chrono::{DateTime, TimeZone, Utc}; use hyper::header::{CONTENT_LENGTH, ETAG, LAST_MODIFIED}; use hyper::HeaderMap; -use snafu::{OptionExt, ResultExt, Snafu}; #[derive(Debug, Copy, Clone)] /// Configuration for header extraction @@ -44,27 +43,27 @@ pub(crate) struct HeaderConfig { pub user_defined_metadata_prefix: Option<&'static str>, } -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[snafu(display("ETag Header missing from response"))] + #[error("ETag Header missing from response")] MissingEtag, - #[snafu(display("Received header containing non-ASCII data"))] + #[error("Received header containing non-ASCII data")] BadHeader { source: reqwest::header::ToStrError }, - #[snafu(display("Last-Modified Header missing from response"))] + #[error("Last-Modified Header missing from response")] MissingLastModified, - #[snafu(display("Content-Length Header missing from response"))] + #[error("Content-Length Header missing from response")] MissingContentLength, - #[snafu(display("Invalid last modified '{}': {}", last_modified, source))] + #[error("Invalid last modified '{}': {}", last_modified, source)] InvalidLastModified { last_modified: String, source: chrono::ParseError, }, - #[snafu(display("Invalid content length '{}': {}", content_length, source))] + #[error("Invalid content length '{}': {}", content_length, source)] InvalidContentLength { content_length: String, source: std::num::ParseIntError, @@ -86,7 +85,11 @@ pub(crate) fn get_put_result( #[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] pub(crate) fn get_version(headers: &HeaderMap, version: &str) -> Result, Error> { Ok(match headers.get(version) { - Some(x) => Some(x.to_str().context(BadHeaderSnafu)?.to_string()), + Some(x) => Some( + x.to_str() + .map_err(|source| Error::BadHeader { source })? + .to_string(), + ), None => None, }) } @@ -94,7 +97,10 @@ pub(crate) fn get_version(headers: &HeaderMap, version: &str) -> Result Result { let e_tag = headers.get(ETAG).ok_or(Error::MissingEtag)?; - Ok(e_tag.to_str().context(BadHeaderSnafu)?.to_string()) + Ok(e_tag + .to_str() + .map_err(|source| Error::BadHeader { source })? + .to_string()) } /// Extracts [`ObjectMeta`] from the provided [`HeaderMap`] @@ -105,9 +111,15 @@ pub(crate) fn header_meta( ) -> Result { let last_modified = match headers.get(LAST_MODIFIED) { Some(last_modified) => { - let last_modified = last_modified.to_str().context(BadHeaderSnafu)?; + let last_modified = last_modified + .to_str() + .map_err(|source| Error::BadHeader { source })?; + DateTime::parse_from_rfc2822(last_modified) - .context(InvalidLastModifiedSnafu { last_modified })? + .map_err(|source| Error::InvalidLastModified { + last_modified: last_modified.into(), + source, + })? 
.with_timezone(&Utc) } None if cfg.last_modified_required => return Err(Error::MissingLastModified), @@ -122,15 +134,25 @@ pub(crate) fn header_meta( let content_length = headers .get(CONTENT_LENGTH) - .context(MissingContentLengthSnafu)?; + .ok_or(Error::MissingContentLength)?; + + let content_length = content_length + .to_str() + .map_err(|source| Error::BadHeader { source })?; - let content_length = content_length.to_str().context(BadHeaderSnafu)?; let size = content_length .parse() - .context(InvalidContentLengthSnafu { content_length })?; + .map_err(|source| Error::InvalidContentLength { + content_length: content_length.into(), + source, + })?; let version = match cfg.version_header.and_then(|h| headers.get(h)) { - Some(v) => Some(v.to_str().context(BadHeaderSnafu)?.to_string()), + Some(v) => Some( + v.to_str() + .map_err(|source| Error::BadHeader { source })? + .to_string(), + ), None => None, }; diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs index a8a8e58de4d0..8938b0861cca 100644 --- a/object_store/src/client/retry.rs +++ b/object_store/src/client/retry.rs @@ -22,30 +22,29 @@ use crate::PutPayload; use futures::future::BoxFuture; use reqwest::header::LOCATION; use reqwest::{Client, Request, Response, StatusCode}; -use snafu::Error as SnafuError; -use snafu::Snafu; +use std::error::Error as StdError; use std::time::{Duration, Instant}; use tracing::info; /// Retry request error -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[snafu(display("Received redirect without LOCATION, this normally indicates an incorrectly configured region"))] + #[error("Received redirect without LOCATION, this normally indicates an incorrectly configured region")] BareRedirect, - #[snafu(display("Server error, body contains Error, with status {status}: {}", body.as_deref().unwrap_or("No Body")))] + #[error("Server error, body contains Error, with status {status}: {}", body.as_deref().unwrap_or("No Body"))] Server { status: StatusCode, body: Option, }, - #[snafu(display("Client error with status {status}: {}", body.as_deref().unwrap_or("No Body")))] + #[error("Client error with status {status}: {}", body.as_deref().unwrap_or("No Body"))] Client { status: StatusCode, body: Option, }, - #[snafu(display("Error after {retries} retries in {elapsed:?}, max_retries:{max_retries}, retry_timeout:{retry_timeout:?}, source:{source}"))] + #[error("Error after {retries} retries in {elapsed:?}, max_retries:{max_retries}, retry_timeout:{retry_timeout:?}, source:{source}")] Reqwest { retries: usize, max_retries: usize, diff --git a/object_store/src/delimited.rs b/object_store/src/delimited.rs index 96f88bf41ff7..5b11a0bf7eb1 100644 --- a/object_store/src/delimited.rs +++ b/object_store/src/delimited.rs @@ -21,16 +21,15 @@ use std::collections::VecDeque; use bytes::Bytes; use futures::{Stream, StreamExt}; -use snafu::{ensure, Snafu}; use super::Result; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("encountered unterminated string"))] + #[error("encountered unterminated string")] UnterminatedString, - #[snafu(display("encountered trailing escape character"))] + #[error("encountered trailing escape character")] TrailingEscape, } @@ -125,8 +124,12 @@ impl LineDelimiter { /// Returns `true` if there is no remaining data to be read fn finish(&mut self) -> Result { if !self.remainder.is_empty() { - ensure!(!self.is_quote, UnterminatedStringSnafu); - ensure!(!self.is_escape, TrailingEscapeSnafu); + if self.is_quote { + 
Err(Error::UnterminatedString)?; + } + if self.is_escape { + Err(Error::TrailingEscape)?; + } self.complete .push_back(Bytes::from(std::mem::take(&mut self.remainder))) diff --git a/object_store/src/gcp/builder.rs b/object_store/src/gcp/builder.rs index fac923c4b9a0..cc5c1e1a0745 100644 --- a/object_store/src/gcp/builder.rs +++ b/object_store/src/gcp/builder.rs @@ -27,7 +27,6 @@ use crate::gcp::{ }; use crate::{ClientConfigKey, ClientOptions, Result, RetryConfig, StaticCredentialProvider}; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -37,33 +36,33 @@ use super::credential::{AuthorizedUserSigningCredentials, InstanceSigningCredent const TOKEN_MIN_TTL: Duration = Duration::from_secs(4 * 60); -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Missing bucket name"))] + #[error("Missing bucket name")] MissingBucketName {}, - #[snafu(display("One of service account path or service account key may be provided."))] + #[error("One of service account path or service account key may be provided.")] ServiceAccountPathAndKeyProvided, - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + #[error("Unable parse source url. Url: {}, Error: {}", url, source)] UnableToParseUrl { source: url::ParseError, url: String, }, - #[snafu(display( + #[error( "Unknown url scheme cannot be parsed into storage location: {}", scheme - ))] + )] UnknownUrlScheme { scheme: String }, - #[snafu(display("URL did not match any known pattern for scheme: {}", url))] + #[error("URL did not match any known pattern for scheme: {}", url)] UrlNotRecognised { url: String }, - #[snafu(display("Configuration key: '{}' is not known.", key))] + #[error("Configuration key: '{}' is not known.", key)] UnknownConfigurationKey { key: String }, - #[snafu(display("GCP credential error: {}", source))] + #[error("GCP credential error: {}", source)] Credential { source: credential::Error }, } @@ -319,12 +318,21 @@ impl GoogleCloudStorageBuilder { /// This is a separate member function to allow fallible computation to /// be deferred until [`Self::build`] which in turn allows deriving [`Clone`] fn parse_url(&mut self, url: &str) -> Result<()> { - let parsed = Url::parse(url).context(UnableToParseUrlSnafu { url })?; - let host = parsed.host_str().context(UrlNotRecognisedSnafu { url })?; + let parsed = Url::parse(url).map_err(|source| Error::UnableToParseUrl { + source, + url: url.to_string(), + })?; + + let host = parsed.host_str().ok_or_else(|| Error::UrlNotRecognised { + url: url.to_string(), + })?; match parsed.scheme() { "gs" => self.bucket_name = Some(host.to_string()), - scheme => return Err(UnknownUrlSchemeSnafu { scheme }.build().into()), + scheme => { + let scheme = scheme.to_string(); + return Err(Error::UnknownUrlScheme { scheme }.into()); + } } Ok(()) } @@ -428,12 +436,14 @@ impl GoogleCloudStorageBuilder { // First try to initialize from the service account information. let service_account_credentials = match (self.service_account_path, self.service_account_key) { - (Some(path), None) => { - Some(ServiceAccountCredentials::from_file(path).context(CredentialSnafu)?) - } - (None, Some(key)) => { - Some(ServiceAccountCredentials::from_key(&key).context(CredentialSnafu)?) 
- } + (Some(path), None) => Some( + ServiceAccountCredentials::from_file(path) + .map_err(|source| Error::Credential { source })?, + ), + (None, Some(key)) => Some( + ServiceAccountCredentials::from_key(&key) + .map_err(|source| Error::Credential { source })?, + ), (None, None) => None, (Some(_), Some(_)) => return Err(Error::ServiceAccountPathAndKeyProvided.into()), }; diff --git a/object_store/src/gcp/client.rs b/object_store/src/gcp/client.rs index ccc9c341f2fe..1928d13b4739 100644 --- a/object_store/src/gcp/client.rs +++ b/object_store/src/gcp/client.rs @@ -44,7 +44,6 @@ use percent_encoding::{percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::header::HeaderName; use reqwest::{Client, Method, RequestBuilder, Response, StatusCode}; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt, Snafu}; use std::sync::Arc; const VERSION_HEADER: &str = "x-goog-generation"; @@ -53,62 +52,62 @@ const USER_DEFINED_METADATA_HEADER_PREFIX: &str = "x-goog-meta-"; static VERSION_MATCH: HeaderName = HeaderName::from_static("x-goog-if-generation-match"); -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Error performing list request: {}", source))] + #[error("Error performing list request: {}", source)] ListRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting list response body: {}", source))] + #[error("Error getting list response body: {}", source)] ListResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid list response: {}", source))] + #[error("Got invalid list response: {}", source)] InvalidListResponse { source: quick_xml::de::DeError }, - #[snafu(display("Error performing get request {}: {}", path, source))] + #[error("Error performing get request {}: {}", path, source)] GetRequest { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error performing request {}: {}", path, source))] + #[error("Error performing request {}: {}", path, source)] Request { source: crate::client::retry::Error, path: String, }, - #[snafu(display("Error getting put response body: {}", source))] + #[error("Error getting put response body: {}", source)] PutResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid put request: {}", source))] + #[error("Got invalid put request: {}", source)] InvalidPutRequest { source: quick_xml::se::SeError }, - #[snafu(display("Got invalid put response: {}", source))] + #[error("Got invalid put response: {}", source)] InvalidPutResponse { source: quick_xml::de::DeError }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] + #[error("Unable to extract metadata from headers: {}", source)] Metadata { source: crate::client::header::Error, }, - #[snafu(display("Version required for conditional update"))] + #[error("Version required for conditional update")] MissingVersion, - #[snafu(display("Error performing complete multipart request: {}", source))] + #[error("Error performing complete multipart request: {}", source)] CompleteMultipartRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting complete multipart response body: {}", source))] + #[error("Error getting complete multipart response body: {}", source)] CompleteMultipartResponseBody { source: reqwest::Error }, - #[snafu(display("Got invalid multipart response: {}", source))] + #[error("Got invalid multipart response: {}", source)] InvalidMultipartResponse { source: quick_xml::de::DeError }, - #[snafu(display("Error signing blob: {}", 
source))] + #[error("Error signing blob: {}", source)] SignBlobRequest { source: crate::client::retry::Error }, - #[snafu(display("Got invalid signing blob response: {}", source))] + #[error("Got invalid signing blob response: {}", source)] InvalidSignBlobResponse { source: reqwest::Error }, - #[snafu(display("Got invalid signing blob signature: {}", source))] + #[error("Got invalid signing blob signature: {}", source)] InvalidSignBlobSignature { source: base64::DecodeError }, } @@ -236,15 +235,17 @@ impl<'a> Request<'a> { .payload(self.payload) .send() .await - .context(RequestSnafu { - path: self.path.as_ref(), + .map_err(|source| { + let path = self.path.as_ref().into(); + Error::Request { source, path } })?; Ok(resp) } async fn do_put(self) -> Result { let response = self.send().await?; - Ok(get_put_result(response.headers(), VERSION_HEADER).context(MetadataSnafu)?) + Ok(get_put_result(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?) } } @@ -336,17 +337,17 @@ impl GoogleCloudStorageClient { .idempotent(true) .send() .await - .context(SignBlobRequestSnafu)?; + .map_err(|source| Error::SignBlobRequest { source })?; //If successful, the signature is returned in the signedBlob field in the response. let response = response .json::() .await - .context(InvalidSignBlobResponseSnafu)?; + .map_err(|source| Error::InvalidSignBlobResponse { source })?; let signed_blob = BASE64_STANDARD .decode(response.signed_blob) - .context(InvalidSignBlobSignatureSnafu)?; + .map_err(|source| Error::InvalidSignBlobSignature { source })?; Ok(hex_encode(&signed_blob)) } @@ -389,7 +390,7 @@ impl GoogleCloudStorageClient { PutMode::Overwrite => builder.idempotent(true), PutMode::Create => builder.header(&VERSION_MATCH, "0"), PutMode::Update(v) => { - let etag = v.version.as_ref().context(MissingVersionSnafu)?; + let etag = v.version.as_ref().ok_or(Error::MissingVersion)?; builder.header(&VERSION_MATCH, etag) } }; @@ -443,9 +444,14 @@ impl GoogleCloudStorageClient { .send() .await?; - let data = response.bytes().await.context(PutResponseBodySnafu)?; + let data = response + .bytes() + .await + .map_err(|source| Error::PutResponseBody { source })?; + let result: InitiateMultipartUploadResult = - quick_xml::de::from_reader(data.as_ref().reader()).context(InvalidPutResponseSnafu)?; + quick_xml::de::from_reader(data.as_ref().reader()) + .map_err(|source| Error::InvalidPutResponse { source })?; Ok(result.upload_id) } @@ -467,8 +473,9 @@ impl GoogleCloudStorageClient { .query(&[("uploadId", multipart_id)]) .send_retry(&self.config.retry_config) .await - .context(RequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::Request { source, path } })?; Ok(()) @@ -498,7 +505,7 @@ impl GoogleCloudStorageClient { let credential = self.get_credential().await?; let data = quick_xml::se::to_string(&upload_info) - .context(InvalidPutRequestSnafu)? + .map_err(|source| Error::InvalidPutRequest { source })? 
// We cannot disable the escaping that transforms "/" to ""e;" :( // https://github.com/tafia/quick-xml/issues/362 // https://github.com/tafia/quick-xml/issues/350 @@ -514,17 +521,18 @@ impl GoogleCloudStorageClient { .idempotent(true) .send() .await - .context(CompleteMultipartRequestSnafu)?; + .map_err(|source| Error::CompleteMultipartRequest { source })?; - let version = get_version(response.headers(), VERSION_HEADER).context(MetadataSnafu)?; + let version = get_version(response.headers(), VERSION_HEADER) + .map_err(|source| Error::Metadata { source })?; let data = response .bytes() .await - .context(CompleteMultipartResponseBodySnafu)?; + .map_err(|source| Error::CompleteMultipartResponseBody { source })?; - let response: CompleteMultipartUploadResult = - quick_xml::de::from_reader(data.reader()).context(InvalidMultipartResponseSnafu)?; + let response: CompleteMultipartUploadResult = quick_xml::de::from_reader(data.reader()) + .map_err(|source| Error::InvalidMultipartResponse { source })?; Ok(PutResult { e_tag: Some(response.e_tag), @@ -615,8 +623,9 @@ impl GetClient for GoogleCloudStorageClient { .with_get_options(options) .send_retry(&self.config.retry_config) .await - .context(GetRequestSnafu { - path: path.as_ref(), + .map_err(|source| { + let path = path.as_ref().into(); + Error::GetRequest { source, path } })?; Ok(response) @@ -665,13 +674,13 @@ impl ListClient for GoogleCloudStorageClient { .bearer_auth(&credential.bearer) .send_retry(&self.config.retry_config) .await - .context(ListRequestSnafu)? + .map_err(|source| Error::ListRequest { source })? .bytes() .await - .context(ListResponseBodySnafu)?; + .map_err(|source| Error::ListResponseBody { source })?; - let mut response: ListResponse = - quick_xml::de::from_reader(response.reader()).context(InvalidListResponseSnafu)?; + let mut response: ListResponse = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidListResponse { source })?; let token = response.next_continuation_token.take(); Ok((response.try_into()?, token)) diff --git a/object_store/src/gcp/credential.rs b/object_store/src/gcp/credential.rs index 155a80b343b2..4b21ad1d3eab 100644 --- a/object_store/src/gcp/credential.rs +++ b/object_store/src/gcp/credential.rs @@ -33,7 +33,6 @@ use percent_encoding::utf8_percent_encode; use reqwest::{Client, Method}; use ring::signature::RsaKeyPair; use serde::Deserialize; -use snafu::{ResultExt, Snafu}; use std::collections::BTreeMap; use std::env; use std::fs::File; @@ -54,36 +53,39 @@ const DEFAULT_GCS_SIGN_BLOB_HOST: &str = "storage.googleapis.com"; const DEFAULT_METADATA_HOST: &str = "metadata.google.internal"; const DEFAULT_METADATA_IP: &str = "169.254.169.254"; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[snafu(display("Unable to open service account file from {}: {}", path.display(), source))] + #[error("Unable to open service account file from {}: {}", path.display(), source)] OpenCredentials { source: std::io::Error, path: PathBuf, }, - #[snafu(display("Unable to decode service account file: {}", source))] + #[error("Unable to decode service account file: {}", source)] DecodeCredentials { source: serde_json::Error }, - #[snafu(display("No RSA key found in pem file"))] + #[error("No RSA key found in pem file")] MissingKey, - #[snafu(display("Invalid RSA key: {}", source), context(false))] - InvalidKey { source: ring::error::KeyRejected }, + #[error("Invalid RSA key: {}", source)] + InvalidKey { + #[from] + source: ring::error::KeyRejected, + }, - 
#[snafu(display("Error signing: {}", source))] + #[error("Error signing: {}", source)] Sign { source: ring::error::Unspecified }, - #[snafu(display("Error encoding jwt payload: {}", source))] + #[error("Error encoding jwt payload: {}", source)] Encode { source: serde_json::Error }, - #[snafu(display("Unsupported key encoding: {}", encoding))] + #[error("Unsupported key encoding: {}", encoding)] UnsupportedKey { encoding: String }, - #[snafu(display("Error performing token request: {}", source))] + #[error("Error performing token request: {}", source)] TokenRequest { source: crate::client::retry::Error }, - #[snafu(display("Error getting token response body: {}", source))] + #[error("Error getting token response body: {}", source)] TokenResponseBody { source: reqwest::Error }, } @@ -153,7 +155,7 @@ impl ServiceAccountKey { string_to_sign.as_bytes(), &mut signature, ) - .context(SignSnafu)?; + .map_err(|source| Error::Sign { source })?; Ok(hex_encode(&signature)) } @@ -289,7 +291,7 @@ impl TokenProvider for SelfSignedJwt { message.as_bytes(), &mut sig_bytes, ) - .context(SignSnafu)?; + .map_err(|source| Error::Sign { source })?; let signature = BASE64_URL_SAFE_NO_PAD.encode(sig_bytes); let bearer = [message, signature].join("."); @@ -305,11 +307,12 @@ fn read_credentials_file(service_account_path: impl AsRef) - where T: serde::de::DeserializeOwned, { - let file = File::open(&service_account_path).context(OpenCredentialsSnafu { - path: service_account_path.as_ref().to_owned(), + let file = File::open(&service_account_path).map_err(|source| { + let path = service_account_path.as_ref().to_owned(); + Error::OpenCredentials { source, path } })?; let reader = BufReader::new(file); - serde_json::from_reader(reader).context(DecodeCredentialsSnafu) + serde_json::from_reader(reader).map_err(|source| Error::DecodeCredentials { source }) } /// A deserialized `service-account-********.json`-file. @@ -341,7 +344,7 @@ impl ServiceAccountCredentials { /// Create a new [`ServiceAccountCredentials`] from a string. pub(crate) fn from_key(key: &str) -> Result { - serde_json::from_str(key).context(DecodeCredentialsSnafu) + serde_json::from_str(key).map_err(|source| Error::DecodeCredentials { source }) } /// Create a [`SelfSignedJwt`] from this credentials struct. @@ -380,7 +383,7 @@ fn seconds_since_epoch() -> u64 { } fn b64_encode_obj(obj: &T) -> Result { - let string = serde_json::to_string(obj).context(EncodeSnafu)?; + let string = serde_json::to_string(obj).map_err(|source| Error::Encode { source })?; Ok(BASE64_URL_SAFE_NO_PAD.encode(string)) } @@ -404,10 +407,10 @@ async fn make_metadata_request( .query(&[("audience", "https://www.googleapis.com/oauth2/v4/token")]) .send_retry(retry) .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(response) } @@ -467,10 +470,10 @@ async fn make_metadata_request_for_email( .header("Metadata-Flavor", "Google") .send_retry(retry) .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .text() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(response) } @@ -608,10 +611,10 @@ impl AuthorizedUserSigningCredentials { .query(&[("access_token", &self.credential.refresh_token)]) .send_retry(retry) .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? 
.json::() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(response.email) } @@ -659,10 +662,10 @@ impl TokenProvider for AuthorizedUserCredentials { .idempotent(true) .send() .await - .context(TokenRequestSnafu)? + .map_err(|source| Error::TokenRequest { source })? .json::() .await - .context(TokenResponseBodySnafu)?; + .map_err(|source| Error::TokenResponseBody { source })?; Ok(TemporaryToken { token: Arc::new(GcpCredential { diff --git a/object_store/src/http/client.rs b/object_store/src/http/client.rs index eeb7e5694228..41e6464c1999 100644 --- a/object_store/src/http/client.rs +++ b/object_store/src/http/client.rs @@ -32,42 +32,41 @@ use hyper::header::{ use percent_encoding::percent_decode_str; use reqwest::{Method, Response, StatusCode}; use serde::Deserialize; -use snafu::{OptionExt, ResultExt, Snafu}; use url::Url; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Request error: {}", source))] + #[error("Request error: {}", source)] Request { source: retry::Error }, - #[snafu(display("Request error: {}", source))] + #[error("Request error: {}", source)] Reqwest { source: reqwest::Error }, - #[snafu(display("Range request not supported by {}", href))] + #[error("Range request not supported by {}", href)] RangeNotSupported { href: String }, - #[snafu(display("Error decoding PROPFIND response: {}", source))] + #[error("Error decoding PROPFIND response: {}", source)] InvalidPropFind { source: quick_xml::de::DeError }, - #[snafu(display("Missing content size for {}", href))] + #[error("Missing content size for {}", href)] MissingSize { href: String }, - #[snafu(display("Error getting properties of \"{}\" got \"{}\"", href, status))] + #[error("Error getting properties of \"{}\" got \"{}\"", href, status)] PropStatus { href: String, status: String }, - #[snafu(display("Failed to parse href \"{}\": {}", href, source))] + #[error("Failed to parse href \"{}\": {}", href, source)] InvalidHref { href: String, source: url::ParseError, }, - #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))] + #[error("Path \"{}\" contained non-unicode characters: {}", path, source)] NonUnicode { path: String, source: std::str::Utf8Error, }, - #[snafu(display("Encountered invalid path \"{}\": {}", path, source))] + #[error("Encountered invalid path \"{}\": {}", path, source)] InvalidPath { path: String, source: crate::path::Error, @@ -129,7 +128,7 @@ impl Client { .request(method, url) .send_retry(&self.retry_config) .await - .context(RequestSnafu)?; + .map_err(|source| Error::Request { source })?; Ok(()) } @@ -236,7 +235,10 @@ impl Client { .await; let response = match result { - Ok(result) => result.bytes().await.context(ReqwestSnafu)?, + Ok(result) => result + .bytes() + .await + .map_err(|source| Error::Reqwest { source })?, Err(e) if matches!(e.status(), Some(StatusCode::NOT_FOUND)) => { return match depth { "0" => { @@ -255,7 +257,9 @@ impl Client { Err(source) => return Err(Error::Request { source }.into()), }; - let status = quick_xml::de::from_reader(response.reader()).context(InvalidPropFindSnafu)?; + let status = quick_xml::de::from_reader(response.reader()) + .map_err(|source| Error::InvalidPropFind { source })?; + Ok(status) } @@ -397,14 +401,23 @@ impl MultiStatusResponse { let url = Url::options() .base_url(Some(base_url)) .parse(&self.href) - .context(InvalidHrefSnafu { href: &self.href })?; + .map_err(|source| Error::InvalidHref { + href: self.href.clone(), + 
source, + })?; // Reverse any percent encoding let path = percent_decode_str(url.path()) .decode_utf8() - .context(NonUnicodeSnafu { path: url.path() })?; + .map_err(|source| Error::NonUnicode { + path: url.path().into(), + source, + })?; - Ok(Path::parse(path.as_ref()).context(InvalidPathSnafu { path })?) + Ok(Path::parse(path.as_ref()).map_err(|source| { + let path = path.into(); + Error::InvalidPath { path, source } + })?) } fn size(&self) -> Result { @@ -412,7 +425,10 @@ impl MultiStatusResponse { .prop_stat .prop .content_length - .context(MissingSizeSnafu { href: &self.href })?; + .ok_or_else(|| Error::MissingSize { + href: self.href.clone(), + })?; + Ok(size) } diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index 4b1c927e74f5..417f72856722 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -35,7 +35,6 @@ use async_trait::async_trait; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; -use snafu::{OptionExt, ResultExt, Snafu}; use url::Url; use crate::client::get::GetClientExt; @@ -49,18 +48,18 @@ use crate::{ mod client; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("Must specify a URL"))] + #[error("Must specify a URL")] MissingUrl, - #[snafu(display("Unable parse source url. Url: {}, Error: {}", url, source))] + #[error("Unable parse source url. Url: {}, Error: {}", url, source)] UnableToParseUrl { source: url::ParseError, url: String, }, - #[snafu(display("Unable to extract metadata from headers: {}", source))] + #[error("Unable to extract metadata from headers: {}", source)] Metadata { source: crate::client::header::Error, }, @@ -235,8 +234,8 @@ impl HttpBuilder { /// Build an [`HttpStore`] with the configured options pub fn build(self) -> Result { - let url = self.url.context(MissingUrlSnafu)?; - let parsed = Url::parse(&url).context(UnableToParseUrlSnafu { url })?; + let url = self.url.ok_or(Error::MissingUrl)?; + let parsed = Url::parse(&url).map_err(|source| Error::UnableToParseUrl { url, source })?; Ok(HttpStore { client: Client::new(parsed, self.client_options, self.retry_config)?, diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 6f5733226922..987ffacc6e49 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -566,7 +566,6 @@ use async_trait::async_trait; use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; -use snafu::Snafu; use std::fmt::{Debug, Formatter}; #[cfg(all(feature = "fs", not(target_arch = "wasm32")))] use std::io::{Read, Seek, SeekFrom}; @@ -1229,11 +1228,11 @@ pub struct PutResult { pub type Result = std::result::Result; /// A specialized `Error` for object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] #[non_exhaustive] pub enum Error { /// A fallback error type when no variant matches - #[snafu(display("Generic {} error: {}", store, source))] + #[error("Generic {} error: {}", store, source)] Generic { /// The store this error originated from store: &'static str, @@ -1242,7 +1241,7 @@ pub enum Error { }, /// Error when the object is not found at given location - #[snafu(display("Object at location {} not found: {}", path, source))] + #[error("Object at location {} not found: {}", path, source)] NotFound { /// The path to file path: String, @@ -1251,31 +1250,30 @@ pub enum Error { }, /// Error for invalid path - #[snafu( - display("Encountered object with invalid path: {}", source), - 
context(false) - )] + #[error("Encountered object with invalid path: {}", source)] InvalidPath { /// The wrapped error + #[from] source: path::Error, }, /// Error when `tokio::spawn` failed - #[snafu(display("Error joining spawned task: {}", source), context(false))] + #[error("Error joining spawned task: {}", source)] JoinError { /// The wrapped error + #[from] source: tokio::task::JoinError, }, /// Error when the attempted operation is not supported - #[snafu(display("Operation not supported: {}", source))] + #[error("Operation not supported: {}", source)] NotSupported { /// The wrapped error source: Box, }, /// Error when the object already exists - #[snafu(display("Object at location {} already exists: {}", path, source))] + #[error("Object at location {} already exists: {}", path, source)] AlreadyExists { /// The path to the path: String, @@ -1284,7 +1282,7 @@ pub enum Error { }, /// Error when the required conditions failed for the operation - #[snafu(display("Request precondition failure for path {}: {}", path, source))] + #[error("Request precondition failure for path {}: {}", path, source)] Precondition { /// The path to the file path: String, @@ -1293,7 +1291,7 @@ pub enum Error { }, /// Error when the object at the location isn't modified - #[snafu(display("Object at location {} not modified: {}", path, source))] + #[error("Object at location {} not modified: {}", path, source)] NotModified { /// The path to the file path: String, @@ -1302,16 +1300,16 @@ pub enum Error { }, /// Error when an operation is not implemented - #[snafu(display("Operation not yet implemented."))] + #[error("Operation not yet implemented.")] NotImplemented, /// Error when the used credentials don't have enough permission /// to perform the requested operation - #[snafu(display( + #[error( "The operation lacked the necessary privileges to complete for path {}: {}", path, source - ))] + )] PermissionDenied { /// The path to the file path: String, @@ -1320,11 +1318,11 @@ pub enum Error { }, /// Error when the used credentials lack valid authentication - #[snafu(display( + #[error( "The operation lacked valid authentication credentials for path {}: {}", path, source - ))] + )] Unauthenticated { /// The path to the file path: String, @@ -1333,7 +1331,7 @@ pub enum Error { }, /// Error when a configuration key is invalid for the store used - #[snafu(display("Configuration key: '{}' is not valid for store '{}'.", key, store))] + #[error("Configuration key: '{}' is not valid for store '{}'.", key, store)] UnknownConfigurationKey { /// The object store used store: &'static str, diff --git a/object_store/src/local.rs b/object_store/src/local.rs index 78fce9c26224..b193481ae7b8 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -30,7 +30,6 @@ use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use futures::{FutureExt, TryStreamExt}; use parking_lot::Mutex; -use snafu::{ensure, OptionExt, ResultExt, Snafu}; use url::Url; use walkdir::{DirEntry, WalkDir}; @@ -43,117 +42,80 @@ use crate::{ }; /// A specialized `Error` for filesystem object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum Error { - #[snafu(display("File size for {} did not fit in a usize: {}", path, source))] + #[error("File size for {} did not fit in a usize: {}", path, source)] FileSizeOverflowedUsize { source: std::num::TryFromIntError, path: String, }, - #[snafu(display("Unable to walk dir: {}", source))] - UnableToWalkDir { - source: walkdir::Error, - }, 
+ #[error("Unable to walk dir: {}", source)] + UnableToWalkDir { source: walkdir::Error }, - #[snafu(display("Unable to access metadata for {}: {}", path, source))] + #[error("Unable to access metadata for {}: {}", path, source)] Metadata { source: Box, path: String, }, - #[snafu(display("Unable to copy data to file: {}", source))] - UnableToCopyDataToFile { - source: io::Error, - }, + #[error("Unable to copy data to file: {}", source)] + UnableToCopyDataToFile { source: io::Error }, - #[snafu(display("Unable to rename file: {}", source))] - UnableToRenameFile { - source: io::Error, - }, + #[error("Unable to rename file: {}", source)] + UnableToRenameFile { source: io::Error }, - #[snafu(display("Unable to create dir {}: {}", path.display(), source))] - UnableToCreateDir { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to create dir {}: {}", path.display(), source)] + UnableToCreateDir { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to create file {}: {}", path.display(), source))] - UnableToCreateFile { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to create file {}: {}", path.display(), source)] + UnableToCreateFile { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to delete file {}: {}", path.display(), source))] - UnableToDeleteFile { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to delete file {}: {}", path.display(), source)] + UnableToDeleteFile { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to open file {}: {}", path.display(), source))] - UnableToOpenFile { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to open file {}: {}", path.display(), source)] + UnableToOpenFile { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to read data from file {}: {}", path.display(), source))] - UnableToReadBytes { - source: io::Error, - path: PathBuf, - }, + #[error("Unable to read data from file {}: {}", path.display(), source)] + UnableToReadBytes { source: io::Error, path: PathBuf }, - #[snafu(display("Out of range of file {}, expected: {}, actual: {}", path.display(), expected, actual))] + #[error("Out of range of file {}, expected: {}, actual: {}", path.display(), expected, actual)] OutOfRange { path: PathBuf, expected: usize, actual: usize, }, - #[snafu(display("Requested range was invalid"))] - InvalidRange { - source: InvalidGetRange, - }, + #[error("Requested range was invalid")] + InvalidRange { source: InvalidGetRange }, - #[snafu(display("Unable to copy file from {} to {}: {}", from.display(), to.display(), source))] + #[error("Unable to copy file from {} to {}: {}", from.display(), to.display(), source)] UnableToCopyFile { from: PathBuf, to: PathBuf, source: io::Error, }, - NotFound { - path: PathBuf, - source: io::Error, - }, + #[error("NotFound")] + NotFound { path: PathBuf, source: io::Error }, - #[snafu(display("Error seeking file {}: {}", path.display(), source))] - Seek { - source: io::Error, - path: PathBuf, - }, + #[error("Error seeking file {}: {}", path.display(), source)] + Seek { source: io::Error, path: PathBuf }, - #[snafu(display("Unable to convert URL \"{}\" to filesystem path", url))] - InvalidUrl { - url: Url, - }, + #[error("Unable to convert URL \"{}\" to filesystem path", url)] + InvalidUrl { url: Url }, - AlreadyExists { - path: String, - source: io::Error, - }, + #[error("AlreadyExists")] + AlreadyExists { path: String, source: io::Error }, - #[snafu(display("Unable to canonicalize filesystem root: {}", path.display()))] - UnableToCanonicalize { - 
path: PathBuf, - source: io::Error, - }, + #[error("Unable to canonicalize filesystem root: {}", path.display())] + UnableToCanonicalize { path: PathBuf, source: io::Error }, - #[snafu(display("Filenames containing trailing '/#\\d+/' are not supported: {}", path))] - InvalidPath { - path: String, - }, + #[error("Filenames containing trailing '/#\\d+/' are not supported: {}", path)] + InvalidPath { path: String }, - #[snafu(display("Upload aborted"))] + #[error("Upload aborted")] Aborted, } @@ -276,8 +238,9 @@ impl LocalFileSystem { /// Returns an error if the path does not exist /// pub fn new_with_prefix(prefix: impl AsRef) -> Result { - let path = std::fs::canonicalize(&prefix).context(UnableToCanonicalizeSnafu { - path: prefix.as_ref(), + let path = std::fs::canonicalize(&prefix).map_err(|source| { + let path = prefix.as_ref().into(); + Error::UnableToCanonicalize { source, path } })?; Ok(Self { @@ -290,12 +253,12 @@ impl LocalFileSystem { /// Return an absolute filesystem path of the given file location pub fn path_to_filesystem(&self, location: &Path) -> Result { - ensure!( - is_valid_file_path(location), - InvalidPathSnafu { - path: location.as_ref() - } - ); + if !is_valid_file_path(location) { + let path = location.as_ref().into(); + let error = Error::InvalidPath { path }; + return Err(error.into()); + } + let path = self.config.prefix_to_filesystem(location)?; #[cfg(target_os = "windows")] @@ -451,7 +414,9 @@ impl ObjectStore for LocalFileSystem { options.check_preconditions(&meta)?; let range = match options.range { - Some(r) => r.as_range(meta.size).context(InvalidRangeSnafu)?, + Some(r) => r + .as_range(meta.size) + .map_err(|source| Error::InvalidRange { source })?, None => 0..meta.size, }; @@ -721,12 +686,15 @@ impl ObjectStore for LocalFileSystem { /// Creates the parent directories of `path` or returns an error based on `source` if no parent fn create_parent_dirs(path: &std::path::Path, source: io::Error) -> Result<()> { - let parent = path.parent().ok_or_else(|| Error::UnableToCreateFile { - path: path.to_path_buf(), - source, + let parent = path.parent().ok_or_else(|| { + let path = path.to_path_buf(); + Error::UnableToCreateFile { path, source } })?; - std::fs::create_dir_all(parent).context(UnableToCreateDirSnafu { path: parent })?; + std::fs::create_dir_all(parent).map_err(|source| { + let path = parent.into(); + Error::UnableToCreateDir { source, path } + })?; Ok(()) } @@ -796,12 +764,14 @@ impl MultipartUpload for LocalUpload { let s = Arc::clone(&self.state); maybe_spawn_blocking(move || { let mut file = s.file.lock(); - file.seek(SeekFrom::Start(offset)) - .context(SeekSnafu { path: &s.dest })?; + file.seek(SeekFrom::Start(offset)).map_err(|source| { + let path = s.dest.clone(); + Error::Seek { source, path } + })?; data.iter() .try_for_each(|x| file.write_all(x)) - .context(UnableToCopyDataToFileSnafu)?; + .map_err(|source| Error::UnableToCopyDataToFile { source })?; Ok(()) }) @@ -809,12 +779,13 @@ impl MultipartUpload for LocalUpload { } async fn complete(&mut self) -> Result { - let src = self.src.take().context(AbortedSnafu)?; + let src = self.src.take().ok_or(Error::Aborted)?; let s = Arc::clone(&self.state); maybe_spawn_blocking(move || { // Ensure no inflight writes let file = s.file.lock(); - std::fs::rename(&src, &s.dest).context(UnableToRenameFileSnafu)?; + std::fs::rename(&src, &s.dest) + .map_err(|source| Error::UnableToRenameFile { source })?; let metadata = file.metadata().map_err(|e| Error::Metadata { source: e.into(), path: 
src.to_string_lossy().to_string(), @@ -829,9 +800,10 @@ impl MultipartUpload for LocalUpload { } async fn abort(&mut self) -> Result<()> { - let src = self.src.take().context(AbortedSnafu)?; + let src = self.src.take().ok_or(Error::Aborted)?; maybe_spawn_blocking(move || { - std::fs::remove_file(&src).context(UnableToDeleteFileSnafu { path: &src })?; + std::fs::remove_file(&src) + .map_err(|source| Error::UnableToDeleteFile { source, path: src })?; Ok(()) }) .await @@ -898,22 +870,30 @@ pub(crate) fn chunked_stream( pub(crate) fn read_range(file: &mut File, path: &PathBuf, range: Range) -> Result { let to_read = range.end - range.start; file.seek(SeekFrom::Start(range.start as u64)) - .context(SeekSnafu { path })?; + .map_err(|source| { + let path = path.into(); + Error::Seek { source, path } + })?; let mut buf = Vec::with_capacity(to_read); let read = file .take(to_read as u64) .read_to_end(&mut buf) - .context(UnableToReadBytesSnafu { path })?; + .map_err(|source| { + let path = path.into(); + Error::UnableToReadBytes { source, path } + })?; - ensure!( - read == to_read, - OutOfRangeSnafu { - path, + if read != to_read { + let error = Error::OutOfRange { + path: path.into(), expected: to_read, - actual: read - } - ); + actual: read, + }; + + return Err(error.into()); + } + Ok(buf.into()) } @@ -982,8 +962,9 @@ fn get_etag(metadata: &Metadata) -> String { fn convert_metadata(metadata: Metadata, location: Path) -> Result { let last_modified = last_modified(&metadata); - let size = usize::try_from(metadata.len()).context(FileSizeOverflowedUsizeSnafu { - path: location.as_ref(), + let size = usize::try_from(metadata.len()).map_err(|source| { + let path = location.as_ref().into(); + Error::FileSizeOverflowedUsize { source, path } })?; Ok(ObjectMeta { diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index a467e3b88a26..3f3cff3390db 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -25,7 +25,6 @@ use bytes::Bytes; use chrono::{DateTime, Utc}; use futures::{stream::BoxStream, StreamExt}; use parking_lot::RwLock; -use snafu::{OptionExt, ResultExt, Snafu}; use crate::multipart::{MultipartStore, PartId}; use crate::util::InvalidGetRange; @@ -37,24 +36,24 @@ use crate::{ use crate::{GetOptions, PutPayload}; /// A specialized `Error` for in-memory object store-related errors -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] enum Error { - #[snafu(display("No data in memory found. Location: {path}"))] + #[error("No data in memory found. 
Location: {path}")] NoDataInMemory { path: String }, - #[snafu(display("Invalid range: {source}"))] + #[error("Invalid range: {source}")] Range { source: InvalidGetRange }, - #[snafu(display("Object already exists at that location: {path}"))] + #[error("Object already exists at that location: {path}")] AlreadyExists { path: String }, - #[snafu(display("ETag required for conditional update"))] + #[error("ETag required for conditional update")] MissingETag, - #[snafu(display("MultipartUpload not found: {id}"))] + #[error("MultipartUpload not found: {id}")] UploadNotFound { id: String }, - #[snafu(display("Missing part at index: {part}"))] + #[error("Missing part at index: {part}")] MissingPart { part: usize }, } @@ -158,7 +157,7 @@ impl Storage { }), Some(e) => { let existing = e.e_tag.to_string(); - let expected = v.e_tag.context(MissingETagSnafu)?; + let expected = v.e_tag.ok_or(Error::MissingETag)?; if existing == expected { *e = entry; Ok(()) @@ -177,7 +176,7 @@ impl Storage { .parse() .ok() .and_then(|x| self.uploads.get_mut(&x)) - .context(UploadNotFoundSnafu { id })?; + .ok_or_else(|| Error::UploadNotFound { id: id.into() })?; Ok(parts) } @@ -186,7 +185,7 @@ impl Storage { .parse() .ok() .and_then(|x| self.uploads.remove(&x)) - .context(UploadNotFoundSnafu { id })?; + .ok_or_else(|| Error::UploadNotFound { id: id.into() })?; Ok(parts) } } @@ -250,7 +249,9 @@ impl ObjectStore for InMemory { let (range, data) = match options.range { Some(range) => { - let r = range.as_range(entry.data.len()).context(RangeSnafu)?; + let r = range + .as_range(entry.data.len()) + .map_err(|source| Error::Range { source })?; (r.clone(), entry.data.slice(r)) } None => (0..entry.data.len(), entry.data), @@ -272,7 +273,7 @@ impl ObjectStore for InMemory { .map(|range| { let r = GetRange::Bounded(range.clone()) .as_range(entry.data.len()) - .context(RangeSnafu)?; + .map_err(|source| Error::Range { source })?; Ok(entry.data.slice(r)) }) @@ -435,7 +436,7 @@ impl MultipartStore for InMemory { let mut cap = 0; for (part, x) in upload.parts.iter().enumerate() { - cap += x.as_ref().context(MissingPartSnafu { part })?.len(); + cap += x.as_ref().ok_or(Error::MissingPart { part })?.len(); } let mut buf = Vec::with_capacity(cap); for x in &upload.parts { @@ -474,7 +475,7 @@ impl InMemory { .map .get(location) .cloned() - .context(NoDataInMemorySnafu { + .ok_or_else(|| Error::NoDataInMemory { path: location.to_string(), })?; diff --git a/object_store/src/parse.rs b/object_store/src/parse.rs index a3919305281d..bc65a0b8d1c8 100644 --- a/object_store/src/parse.rs +++ b/object_store/src/parse.rs @@ -20,16 +20,18 @@ use crate::local::LocalFileSystem; use crate::memory::InMemory; use crate::path::Path; use crate::ObjectStore; -use snafu::Snafu; use url::Url; -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub enum Error { - #[snafu(display("Unable to recognise URL \"{}\"", url))] + #[error("Unable to recognise URL \"{}\"", url)] Unrecognised { url: Url }, - #[snafu(context(false))] - Path { source: crate::path::Error }, + #[error(transparent)] + Path { + #[from] + source: crate::path::Error, + }, } impl From for super::Error { diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index 4c9bb5f05186..f8affe8dfbb9 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -19,7 +19,6 @@ use itertools::Itertools; use percent_encoding::percent_decode; -use snafu::{ensure, ResultExt, Snafu}; use std::fmt::Formatter; #[cfg(not(target_arch = "wasm32"))] use url::Url; @@ -35,18 
+34,18 @@ mod parts; pub use parts::{InvalidPart, PathPart}; /// Error returned by [`Path::parse`] -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] #[non_exhaustive] pub enum Error { /// Error when there's an empty segment between two slashes `/` in the path - #[snafu(display("Path \"{}\" contained empty path segment", path))] + #[error("Path \"{}\" contained empty path segment", path)] EmptySegment { /// The source path path: String, }, /// Error when an invalid segment is encountered in the given path - #[snafu(display("Error parsing Path \"{}\": {}", path, source))] + #[error("Error parsing Path \"{}\": {}", path, source)] BadSegment { /// The source path path: String, @@ -55,7 +54,7 @@ pub enum Error { }, /// Error when path cannot be canonicalized - #[snafu(display("Failed to canonicalize path \"{}\": {}", path.display(), source))] + #[error("Failed to canonicalize path \"{}\": {}", path.display(), source)] Canonicalize { /// The source path path: std::path::PathBuf, @@ -64,14 +63,14 @@ pub enum Error { }, /// Error when the path is not a valid URL - #[snafu(display("Unable to convert path \"{}\" to URL", path.display()))] + #[error("Unable to convert path \"{}\" to URL", path.display())] InvalidPath { /// The source path path: std::path::PathBuf, }, /// Error when a path contains non-unicode characters - #[snafu(display("Path \"{}\" contained non-unicode characters: {}", path, source))] + #[error("Path \"{}\" contained non-unicode characters: {}", path, source)] NonUnicode { /// The source path path: String, @@ -80,7 +79,7 @@ pub enum Error { }, /// Error when the a path doesn't start with given prefix - #[snafu(display("Path {} does not start with prefix {}", path, prefix))] + #[error("Path {} does not start with prefix {}", path, prefix)] PrefixMismatch { /// The source path path: String, @@ -173,8 +172,14 @@ impl Path { let stripped = stripped.strip_suffix(DELIMITER).unwrap_or(stripped); for segment in stripped.split(DELIMITER) { - ensure!(!segment.is_empty(), EmptySegmentSnafu { path }); - PathPart::parse(segment).context(BadSegmentSnafu { path })?; + if segment.is_empty() { + return Err(Error::EmptySegment { path: path.into() }); + } + + PathPart::parse(segment).map_err(|source| { + let path = path.into(); + Error::BadSegment { source, path } + })?; } Ok(Self { @@ -190,8 +195,9 @@ impl Path { /// /// Note: this will canonicalize the provided path, resolving any symlinks pub fn from_filesystem_path(path: impl AsRef) -> Result { - let absolute = std::fs::canonicalize(&path).context(CanonicalizeSnafu { - path: path.as_ref(), + let absolute = std::fs::canonicalize(&path).map_err(|source| { + let path = path.as_ref().into(); + Error::Canonicalize { source, path } })?; Self::from_absolute_path(absolute) @@ -241,7 +247,10 @@ impl Path { let path = path.as_ref(); let decoded = percent_decode(path.as_bytes()) .decode_utf8() - .context(NonUnicodeSnafu { path })?; + .map_err(|source| { + let path = path.into(); + Error::NonUnicode { source, path } + })?; Self::parse(decoded) } diff --git a/object_store/src/path/parts.rs b/object_store/src/path/parts.rs index de2e1a75c955..9c6612bf9331 100644 --- a/object_store/src/path/parts.rs +++ b/object_store/src/path/parts.rs @@ -19,15 +19,14 @@ use percent_encoding::{percent_encode, AsciiSet, CONTROLS}; use std::borrow::Cow; use crate::path::DELIMITER_BYTE; -use snafu::Snafu; /// Error returned by [`PathPart::parse`] -#[derive(Debug, Snafu)] -#[snafu(display( +#[derive(Debug, thiserror::Error)] +#[error( "Encountered illegal character 
sequence \"{}\" whilst parsing path segment \"{}\"", illegal, segment -))] +)] #[allow(missing_copy_implementations)] pub struct InvalidPart { segment: String, diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 99102a99e61e..6d638f3cb2b8 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -24,7 +24,6 @@ use std::{ use super::Result; use bytes::Bytes; use futures::{stream::StreamExt, Stream, TryStreamExt}; -use snafu::Snafu; #[cfg(any(feature = "azure", feature = "http"))] pub(crate) static RFC1123_FMT: &str = "%a, %d %h %Y %T GMT"; @@ -204,14 +203,12 @@ pub enum GetRange { Suffix(usize), } -#[derive(Debug, Snafu)] +#[derive(Debug, thiserror::Error)] pub(crate) enum InvalidGetRange { - #[snafu(display( - "Wanted range starting at {requested}, but object was only {length} bytes long" - ))] + #[error("Wanted range starting at {requested}, but object was only {length} bytes long")] StartTooLarge { requested: usize, length: usize }, - #[snafu(display("Range started at {start} and ended at {end}"))] + #[error("Range started at {start} and ended at {end}")] Inconsistent { start: usize, end: usize }, } From debd2e872c11f7a4f06488a88a1ad170fca41145 Mon Sep 17 00:00:00 2001 From: Kikkon <19528375+Kikkon@users.noreply.github.com> Date: Fri, 3 Jan 2025 19:44:34 +0800 Subject: [PATCH 27/68] feat: add GenericListViewBuilder (#6552) * feat: add GenericListViewBuilder * remove uszie * fix tests * remove static * lint * chore: add comment for should fail test * Update arrow-array/src/builder/generic_list_view_builder.rs Co-authored-by: Marco Neumann * Update arrow-array/src/builder/generic_list_view_builder.rs Co-authored-by: Marco Neumann * fix name & lint --------- Co-authored-by: Marco Neumann --- .../src/builder/generic_list_view_builder.rs | 707 ++++++++++++++++++ arrow-array/src/builder/mod.rs | 8 + arrow-array/src/builder/struct_builder.rs | 10 + arrow-array/src/cast.rs | 16 + 4 files changed, 741 insertions(+) create mode 100644 arrow-array/src/builder/generic_list_view_builder.rs diff --git a/arrow-array/src/builder/generic_list_view_builder.rs b/arrow-array/src/builder/generic_list_view_builder.rs new file mode 100644 index 000000000000..5aaf9efefe24 --- /dev/null +++ b/arrow-array/src/builder/generic_list_view_builder.rs @@ -0,0 +1,707 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::builder::ArrayBuilder; +use crate::{ArrayRef, GenericListViewArray, OffsetSizeTrait}; +use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer}; +use arrow_schema::{Field, FieldRef}; +use std::any::Any; +use std::sync::Arc; + +/// Builder for [`GenericListViewArray`] +#[derive(Debug)] +pub struct GenericListViewBuilder { + offsets_builder: BufferBuilder, + sizes_builder: BufferBuilder, + null_buffer_builder: NullBufferBuilder, + values_builder: T, + field: Option, + current_offset: OffsetSize, +} + +impl Default for GenericListViewBuilder { + fn default() -> Self { + Self::new(T::default()) + } +} + +impl ArrayBuilder + for GenericListViewBuilder +{ + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl GenericListViewBuilder { + /// Creates a new [`GenericListViewBuilder`] from a given values array builder + pub fn new(values_builder: T) -> Self { + let capacity = values_builder.len(); + Self::with_capacity(values_builder, capacity) + } + + /// Creates a new [`GenericListViewBuilder`] from a given values array builder + /// `capacity` is the number of items to pre-allocate space for in this builder + pub fn with_capacity(values_builder: T, capacity: usize) -> Self { + let offsets_builder = BufferBuilder::::new(capacity); + let sizes_builder = BufferBuilder::::new(capacity); + Self { + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(capacity), + values_builder, + sizes_builder, + field: None, + current_offset: OffsetSize::zero(), + } + } + + /// + /// By default a nullable field is created with the name `item` + /// + /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the + /// field's data type does not match that of `T` + pub fn with_field(self, field: impl Into) -> Self { + Self { + field: Some(field.into()), + ..self + } + } +} + +impl GenericListViewBuilder +where + T: 'static, +{ + /// Returns the child array builder as a mutable reference. + /// + /// This mutable reference can be used to append values into the child array builder, + /// but you must call [`append`](#method.append) to delimit each distinct list value. 
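+    ///
+    /// Each call to [`append`](#method.append) records the entry's starting offset,
+    /// and the number of child values added since the previous call becomes its size.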
+ pub fn values(&mut self) -> &mut T { + &mut self.values_builder + } + + /// Returns the child array builder as an immutable reference + pub fn values_ref(&self) -> &T { + &self.values_builder + } + + /// Finish the current variable-length list array slot + /// + /// # Panics + /// + /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX` + #[inline] + pub fn append(&mut self, is_valid: bool) { + self.offsets_builder.append(self.current_offset); + self.sizes_builder.append( + OffsetSize::from_usize( + self.values_builder.len() - self.current_offset.to_usize().unwrap(), + ) + .unwrap(), + ); + self.null_buffer_builder.append(is_valid); + self.current_offset = OffsetSize::from_usize(self.values_builder.len()).unwrap(); + } + + /// Append value into this [`GenericListViewBuilder`] + #[inline] + pub fn append_value(&mut self, i: I) + where + T: Extend>, + I: IntoIterator>, + { + self.extend(std::iter::once(Some(i))) + } + + /// Append a null to this [`GenericListViewBuilder`] + /// + /// See [`Self::append_value`] for an example use. + #[inline] + pub fn append_null(&mut self) { + self.offsets_builder.append(self.current_offset); + self.sizes_builder + .append(OffsetSize::from_usize(0).unwrap()); + self.null_buffer_builder.append_null(); + } + + /// Appends an optional value into this [`GenericListViewBuilder`] + /// + /// If `Some` calls [`Self::append_value`] otherwise calls [`Self::append_null`] + #[inline] + pub fn append_option(&mut self, i: Option) + where + T: Extend>, + I: IntoIterator>, + { + match i { + Some(i) => self.append_value(i), + None => self.append_null(), + } + } + + /// Builds the [`GenericListViewArray`] and reset this builder. + pub fn finish(&mut self) -> GenericListViewArray { + let values = self.values_builder.finish(); + let nulls = self.null_buffer_builder.finish(); + let offsets = self.offsets_builder.finish(); + self.current_offset = OffsetSize::zero(); + + // Safety: Safe by construction + let offsets = ScalarBuffer::from(offsets); + let sizes = self.sizes_builder.finish(); + let sizes = ScalarBuffer::from(sizes); + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; + GenericListViewArray::new(field, offsets, sizes, values, nulls) + } + + /// Builds the [`GenericListViewArray`] without resetting the builder. 
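+    ///
+    /// Unlike [`Self::finish`] this copies the offset, size and null buffers, so the
+    /// builder remains usable and further values can be appended afterwards.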
+ pub fn finish_cloned(&self) -> GenericListViewArray { + let values = self.values_builder.finish_cloned(); + let nulls = self.null_buffer_builder.finish_cloned(); + + let offsets = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + // Safety: safe by construction + let offsets = ScalarBuffer::from(offsets); + + let sizes = Buffer::from_slice_ref(self.sizes_builder.as_slice()); + let sizes = ScalarBuffer::from(sizes); + + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; + + GenericListViewArray::new(field, offsets, sizes, values, nulls) + } + + /// Returns the current offsets buffer as a slice + pub fn offsets_slice(&self) -> &[OffsetSize] { + self.offsets_builder.as_slice() + } +} + +impl Extend> for GenericListViewBuilder +where + O: OffsetSizeTrait, + B: ArrayBuilder + Extend, + V: IntoIterator, +{ + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + match v { + Some(elements) => { + self.values_builder.extend(elements); + self.append(true); + } + None => self.append(false), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::builder::{make_builder, Int32Builder, ListViewBuilder}; + use crate::cast::AsArray; + use crate::types::Int32Type; + use crate::{Array, Int32Array}; + use arrow_schema::DataType; + + fn test_generic_list_view_array_builder_impl() { + let values_builder = Int32Builder::with_capacity(10); + let mut builder = GenericListViewBuilder::::new(values_builder); + + // [[0, 1, 2], [3, 4, 5], [6, 7]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true); + builder.values().append_value(3); + builder.values().append_value(4); + builder.values().append_value(5); + builder.append(true); + builder.values().append_value(6); + builder.values().append_value(7); + builder.append(true); + let list_array = builder.finish(); + + let list_values = list_array.values().as_primitive::(); + assert_eq!(list_values.values(), &[0, 1, 2, 3, 4, 5, 6, 7]); + assert_eq!(list_array.value_offsets(), [0, 3, 6].map(O::usize_as)); + assert_eq!(list_array.value_sizes(), [3, 3, 2].map(O::usize_as)); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(O::from_usize(6).unwrap(), list_array.value_offsets()[2]); + assert_eq!(O::from_usize(2).unwrap(), list_array.value_sizes()[2]); + for i in 0..2 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + } + + #[test] + fn test_list_view_array_builder() { + test_generic_list_view_array_builder_impl::() + } + + #[test] + fn test_large_list_view_array_builder() { + test_generic_list_view_array_builder_impl::() + } + + fn test_generic_list_view_array_builder_nulls_impl() { + let values_builder = Int32Builder::with_capacity(10); + let mut builder = GenericListViewBuilder::::new(values_builder); + + // [[0, 1, 2], null, [3, null, 5], [6, 7]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true); + builder.append(false); + builder.values().append_value(3); + builder.values().append_null(); + builder.values().append_value(5); + builder.append(true); + builder.values().append_value(6); + builder.values().append_value(7); + builder.append(true); + + let list_array = builder.finish(); + + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(4, list_array.len()); + 
assert_eq!(1, list_array.null_count()); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_offsets()[2]); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_sizes()[2]); + } + + #[test] + fn test_list_view_array_builder_nulls() { + test_generic_list_view_array_builder_nulls_impl::() + } + + #[test] + fn test_large_list_view_array_builder_nulls() { + test_generic_list_view_array_builder_nulls_impl::() + } + + #[test] + fn test_list_view_array_builder_finish() { + let values_builder = Int32Array::builder(5); + let mut builder = ListViewBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert!(builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert!(builder.is_empty()); + } + + #[test] + fn test_list_view_array_builder_finish_cloned() { + let values_builder = Int32Array::builder(5); + let mut builder = ListViewBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + + let mut arr = builder.finish_cloned(); + assert_eq!(2, arr.len()); + assert!(!builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true); + arr = builder.finish(); + assert_eq!(3, arr.len()); + assert!(builder.is_empty()); + } + + #[test] + fn test_list_view_list_view_array_builder() { + let primitive_builder = Int32Builder::with_capacity(10); + let values_builder = ListViewBuilder::new(primitive_builder); + let mut builder = ListViewBuilder::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder.values().values().append_value(1); + builder.values().values().append_value(2); + builder.values().append(true); + builder.values().values().append_value(3); + builder.values().values().append_value(4); + builder.values().append(true); + builder.append(true); + + builder.values().values().append_value(5); + builder.values().values().append_value(6); + builder.values().values().append_value(7); + builder.values().append(true); + builder.values().append(false); + builder.values().values().append_value(8); + builder.values().append(true); + builder.append(true); + + builder.append(false); + + builder.values().values().append_value(9); + builder.values().values().append_value(10); + builder.values().append(true); + builder.append(true); + + let l1 = builder.finish(); + + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5]); + assert_eq!(l1.value_sizes(), &[2, 3, 0, 1]); + + let l2 = l1.values().as_list_view::(); + + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8]); + assert_eq!(l2.value_sizes(), &[2, 2, 3, 0, 1, 2]); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn test_extend() { + let mut builder = ListViewBuilder::new(Int32Builder::new()); + builder.extend([ + Some(vec![Some(1), Some(2), Some(7), None]), + Some(vec![]), + Some(vec![Some(4), Some(5)]), + None, + ]); + + let array = builder.finish(); + assert_eq!(array.value_offsets(), [0, 4, 4, 6]); + assert_eq!(array.value_sizes(), [4, 0, 2, 0]); + 
assert_eq!(array.null_count(), 1); + assert!(array.is_null(3)); + let elements = array.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 7, 0, 4, 5]); + assert_eq!(elements.null_count(), 1); + assert!(elements.is_null(3)); + } + + #[test] + fn test_boxed_primitive_array_builder() { + let values_builder = make_builder(&DataType::Int32, 5); + let mut builder = ListViewBuilder::new(values_builder); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[1, 2, 3]); + builder.append(true); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[4, 5, 6]); + builder.append(true); + + let arr = builder.finish(); + assert_eq!(2, arr.len()); + + let elements = arr.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 3, 4, 5, 6]); + } + + #[test] + fn test_boxed_list_view_list_view_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. + let values_builder = make_builder( + &DataType::ListView(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_view_generic_list_view_array_builder::(values_builder); + } + + #[test] + fn test_boxed_large_list_view_large_list_view_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. + let values_builder = make_builder( + &DataType::LargeListView(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_view_generic_list_view_array_builder::(values_builder); + } + + fn test_boxed_generic_list_view_generic_list_view_array_builder( + values_builder: Box, + ) where + O: OffsetSizeTrait + PartialEq, + { + let mut builder: GenericListViewBuilder> = + GenericListViewBuilder::>::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(1); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(2); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(3); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(4); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true); + builder.append(true); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(5); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(6); + builder + .values() + .as_any_mut() + 
.downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an (Large)ListViewBuilder") + .append_value(7); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(false); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(8); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true); + builder.append(true); + + builder.append(false); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(9); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(10); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true); + builder.append(true); + + let l1 = builder.finish(); + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5].map(O::usize_as)); + assert_eq!(l1.value_sizes(), &[2, 3, 0, 1].map(O::usize_as)); + + let l2 = l1.values().as_list_view::(); + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8].map(O::usize_as)); + assert_eq!(l2.value_sizes(), &[2, 2, 3, 0, 1, 2].map(O::usize_as)); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn test_with_field() { + let field = Arc::new(Field::new("bar", DataType::Int32, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), Some(2), Some(3)]); + builder.append_null(); // This is fine as nullability refers to nullability of values + builder.append_value([Some(4)]); + let array = builder.finish(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::ListView(field.clone())); + + builder.append_value([Some(4), Some(5)]); + let array = builder.finish(); + assert_eq!(array.data_type(), &DataType::ListView(field)); + assert_eq!(array.len(), 1); + } + + #[test] + #[should_panic( + expected = r#"Non-nullable field of ListViewArray \"item\" cannot contain nulls"# + )] + // If a non-nullable type is declared but a null value is used, it will be intercepted by the null check. + fn test_checks_nullability() { + let field = Arc::new(Field::new("item", DataType::Int32, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), None]); + builder.finish(); + } + + #[test] + #[should_panic(expected = "ListViewArray expected data type Int64 got Int32")] + // If the declared type does not match the actual appended type, it will be intercepted by type checking in the finish function. 
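+    // (the panic below is raised by GenericListViewArray::new, which finish calls
+    // to validate the child data type against the declared field)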
+ fn test_checks_data_type() { + let field = Arc::new(Field::new("item", DataType::Int64, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1)]); + builder.finish(); + } +} diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 89a96280eb87..982e8788b90d 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -180,6 +180,8 @@ mod generic_byte_run_builder; pub use generic_byte_run_builder::*; mod generic_bytes_view_builder; pub use generic_bytes_view_builder::*; +mod generic_list_view_builder; +pub use generic_list_view_builder::*; mod union_builder; pub use union_builder::*; @@ -304,6 +306,12 @@ pub type ListBuilder = GenericListBuilder; /// Builder for [`LargeListArray`](crate::array::LargeListArray) pub type LargeListBuilder = GenericListBuilder; +/// Builder for [`ListViewArray`](crate::array::ListViewArray) +pub type ListViewBuilder = GenericListViewBuilder; + +/// Builder for [`LargeListViewArray`](crate::array::LargeListViewArray) +pub type LargeListViewBuilder = GenericListViewBuilder; + /// Builder for [`BinaryArray`](crate::array::BinaryArray) /// /// See examples on [`GenericBinaryBuilder`] diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 69c551fdb5b6..c7299d076ab0 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -276,6 +276,16 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { + let builder = make_builder(field.data_type(), capacity); + Box::new(ListViewBuilder::with_capacity(builder, capacity).with_field(field.clone())) + } + DataType::LargeListView(field) => { + let builder = make_builder(field.data_type(), capacity); + Box::new( + LargeListViewBuilder::with_capacity(builder, capacity).with_field(field.clone()), + ) + } DataType::Map(field, _) => match field.data_type() { DataType::Struct(fields) => { let map_field_names = MapFieldNames { diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 9947c36d4619..0b76193c7565 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -838,6 +838,14 @@ pub trait AsArray: private::Sealed { self.as_list_opt().expect("list array") } + /// Downcast this to a [`GenericListViewArray`] returning `None` if not possible + fn as_list_view_opt(&self) -> Option<&GenericListViewArray>; + + /// Downcast this to a [`GenericListViewArray`] panicking if not possible + fn as_list_view(&self) -> &GenericListViewArray { + self.as_list_view_opt().expect("list view array") + } + /// Downcast this to a [`FixedSizeBinaryArray`] returning `None` if not possible fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray>; @@ -911,6 +919,10 @@ impl AsArray for dyn Array + '_ { self.as_any().downcast_ref() } + fn as_list_view_opt(&self) -> Option<&GenericListViewArray> { + self.as_any().downcast_ref() + } + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray> { self.as_any().downcast_ref() } @@ -966,6 +978,10 @@ impl AsArray for ArrayRef { self.as_ref().as_list_opt() } + fn as_list_view_opt(&self) -> Option<&GenericListViewArray> { + self.as_ref().as_list_view_opt() + } + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray> { self.as_ref().as_fixed_size_binary_opt() } From d31e780d652aba6966e975ab4c20ef9c90bbd242 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 Jan 2025 
13:29:36 +0100 Subject: [PATCH 28/68] Update itertools requirement from 0.13.0 to 0.14.0 in /object_store (#6925) Updates the requirements on [itertools](https://github.com/rust-itertools/itertools) to permit the latest version. - [Changelog](https://github.com/rust-itertools/itertools/blob/master/CHANGELOG.md) - [Commits](https://github.com/rust-itertools/itertools/compare/v0.13.0...v0.14.0) --- updated-dependencies: - dependency-name: itertools dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- object_store/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 6f5e9db1bc70..992ae6662cdb 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -35,7 +35,7 @@ bytes = "1.0" chrono = { version = "0.4.34", default-features = false, features = ["clock"] } futures = "0.3" humantime = "2.1" -itertools = "0.13.0" +itertools = "0.14.0" parking_lot = { version = "0.12" } percent-encoding = "2.1" thiserror = "2.0.2" From e4989aad5acb8b62cfe98b130f78961f8bcc34bb Mon Sep 17 00:00:00 2001 From: wiedld Date: Sat, 4 Jan 2025 05:21:59 -0500 Subject: [PATCH 29/68] Document how to use Extend for generic methods on ArrayBuilders (#6932) * chore: add docs for how to use Extend for generic methods on ArrayBuilders * chore: move to mod docs and add more examples --- arrow-array/src/builder/mod.rs | 69 ++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 982e8788b90d..29d75024ea72 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -78,6 +78,73 @@ //! )) //! ``` //! +//! # Using the [`Extend`] trait to append values from an iterable: +//! +//! ``` +//! # use arrow_array::{Array}; +//! # use arrow_array::builder::{ArrayBuilder, StringBuilder}; +//! +//! let mut builder = StringBuilder::new(); +//! builder.extend(vec![Some("🍐"), Some("🍎"), None]); +//! assert_eq!(builder.finish().len(), 3); +//! ``` +//! +//! # Using the [`Extend`] trait to write generic functions: +//! +//! ``` +//! # use arrow_array::{Array, ArrayRef, StringArray}; +//! # use arrow_array::builder::{ArrayBuilder, Int32Builder, ListBuilder, StringBuilder}; +//! +//! // For generic methods that fill a list of values for an [`ArrayBuilder`], use the [`Extend`] trait. +//! fn filter_and_fill>(builder: &mut impl Extend, values: I, filter: V) +//! where V: PartialEq +//! { +//! builder.extend(values.into_iter().filter(|v| *v == filter)); +//! } +//! let mut string_builder = StringBuilder::new(); +//! filter_and_fill( +//! &mut string_builder, +//! vec![Some("🍐"), Some("🍎"), None], +//! Some("🍎"), +//! ); +//! assert_eq!(string_builder.finish().len(), 1); +//! +//! let mut int_builder = Int32Builder::new(); +//! filter_and_fill( +//! &mut int_builder, +//! vec![Some(11), Some(42), None], +//! Some(42), +//! ); +//! assert_eq!(int_builder.finish().len(), 1); +//! +//! // For generic methods that fill lists-of-lists for an [`ArrayBuilder`], use the [`Extend`] trait. +//! fn filter_and_fill_if_contains>>( +//! list_builder: &mut impl Extend>, +//! values: I, +//! filter: Option, +//! ) where +//! T: PartialEq, +//! for<'a> &'a V: IntoIterator>, +//! { +//! list_builder.extend(values.into_iter().filter(|string: &Option| { +//! string +//! .as_ref() +//! .map(|str: &V| str.into_iter().any(|ch: &Option| ch == &filter)) +//! 
.unwrap_or(false) +//! })); +//! } +//! let builder = StringBuilder::new(); +//! let mut list_builder = ListBuilder::new(builder); +//! let pear_pear = vec![Some("🍐"),Some("🍐")]; +//! let pear_app = vec![Some("🍐"),Some("🍎")]; +//! filter_and_fill_if_contains( +//! &mut list_builder, +//! vec![Some(pear_pear), Some(pear_app), None], +//! Some("🍎"), +//! ); +//! assert_eq!(list_builder.finish().len(), 1); +//! ``` +//! //! # Custom Builders //! //! It is common to have a collection of statically defined Rust types that @@ -134,6 +201,8 @@ //! } //! } //! +//! /// For building arrays in generic code, use Extend instead of the append_* methods +//! /// e.g. append_value, append_option, append_null //! impl<'a> Extend<&'a MyRow> for MyRowBuilder { //! fn extend>(&mut self, iter: T) { //! iter.into_iter().for_each(|row| self.append(row)); From ce4be76a40a9dd4818e229261da49b89635becfd Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Sat, 4 Jan 2025 05:23:12 -0500 Subject: [PATCH 30/68] [Parquet] Add projection utility functions (#6931) * projection utilities * improve docs --------- Co-authored-by: Andrew Lamb --- parquet/src/arrow/mod.rs | 101 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 1305bbac83f0..35f5897c18f8 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -281,6 +281,45 @@ impl ProjectionMask { pub fn leaf_included(&self, leaf_idx: usize) -> bool { self.mask.as_ref().map(|m| m[leaf_idx]).unwrap_or(true) } + + /// Union two projection masks + /// + /// Example: + /// ```text + /// mask1 = [true, false, true] + /// mask2 = [false, true, true] + /// union(mask1, mask2) = [true, true, true] + /// ``` + pub fn union(&mut self, other: &Self) { + match (self.mask.as_ref(), other.mask.as_ref()) { + (None, _) | (_, None) => self.mask = None, + (Some(a), Some(b)) => { + debug_assert_eq!(a.len(), b.len()); + let mask = a.iter().zip(b.iter()).map(|(&a, &b)| a || b).collect(); + self.mask = Some(mask); + } + } + } + + /// Intersect two projection masks + /// + /// Example: + /// ```text + /// mask1 = [true, false, true] + /// mask2 = [false, true, true] + /// intersect(mask1, mask2) = [false, false, true] + /// ``` + pub fn intersect(&mut self, other: &Self) { + match (self.mask.as_ref(), other.mask.as_ref()) { + (None, _) => self.mask = other.mask.clone(), + (_, None) => {} + (Some(a), Some(b)) => { + debug_assert_eq!(a.len(), b.len()); + let mask = a.iter().zip(b.iter()).map(|(&a, &b)| a && b).collect(); + self.mask = Some(mask); + } + } + } } /// Lookups up the parquet column by name @@ -551,4 +590,66 @@ mod test { let mask = ProjectionMask::columns(&schema, ["a", "e"]); assert_eq!(mask.mask.unwrap(), [true, false, true, false, true]); } + + #[test] + fn test_projection_mask_union() { + let mut mask1 = ProjectionMask { + mask: Some(vec![true, false, true]), + }; + let mask2 = ProjectionMask { + mask: Some(vec![false, true, true]), + }; + mask1.union(&mask2); + assert_eq!(mask1.mask, Some(vec![true, true, true])); + + let mut mask1 = ProjectionMask { mask: None }; + let mask2 = ProjectionMask { + mask: Some(vec![false, true, true]), + }; + mask1.union(&mask2); + assert_eq!(mask1.mask, None); + + let mut mask1 = ProjectionMask { + mask: Some(vec![true, false, true]), + }; + let mask2 = ProjectionMask { mask: None }; + mask1.union(&mask2); + assert_eq!(mask1.mask, None); + + let mut mask1 = ProjectionMask { mask: None }; + let mask2 = ProjectionMask { mask: None }; + 
+        mask1.union(&mask2);
+        assert_eq!(mask1.mask, None);
+    }
+
+    #[test]
+    fn test_projection_mask_intersect() {
+        let mut mask1 = ProjectionMask {
+            mask: Some(vec![true, false, true]),
+        };
+        let mask2 = ProjectionMask {
+            mask: Some(vec![false, true, true]),
+        };
+        mask1.intersect(&mask2);
+        assert_eq!(mask1.mask, Some(vec![false, false, true]));
+
+        let mut mask1 = ProjectionMask { mask: None };
+        let mask2 = ProjectionMask {
+            mask: Some(vec![false, true, true]),
+        };
+        mask1.intersect(&mask2);
+        assert_eq!(mask1.mask, Some(vec![false, true, true]));
+
+        let mut mask1 = ProjectionMask {
+            mask: Some(vec![true, false, true]),
+        };
+        let mask2 = ProjectionMask { mask: None };
+        mask1.intersect(&mask2);
+        assert_eq!(mask1.mask, Some(vec![true, false, true]));
+
+        let mut mask1 = ProjectionMask { mask: None };
+        let mask2 = ProjectionMask { mask: None };
+        mask1.intersect(&mask2);
+        assert_eq!(mask1.mask, None);
+    }
 }

From 94a079f0e67b08910632acb26aa128138c3941b5 Mon Sep 17 00:00:00 2001
From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com>
Date: Sat, 4 Jan 2025 12:24:48 +0200
Subject: [PATCH 31/68] feat(arrow-select): `concat` kernel will merge
 dictionary values for list of dictionaries (#6893)

* feat(arrow-select): make list of dictionary merge dictionary keys

TODO:
- [ ] Add support to nested lists
- [ ] Add more tests
- [ ] Fix failing test

* fix concat lists of dictionaries

* format

* remove unused import

* improve test helper

* feat: add merge offset buffers into one

* format

* add reproduction test

* recommit

* fix clippy

* fix clippy

* fix clippy

* improve offsets code according to code review

* use concat dictionaries

* add specialized code to concat lists to be able to use the concat dictionary logic

* remove the use of ArrayData
---
 arrow-buffer/src/buffer/offset.rs |  52 ++++++++
 arrow-select/src/concat.rs        | 191 ++++++++++++++++++++++++++++--
 2 files changed, 232 insertions(+), 11 deletions(-)

diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs
index a6be2b67af84..164af6f01d0e 100644
--- a/arrow-buffer/src/buffer/offset.rs
+++ b/arrow-buffer/src/buffer/offset.rs
@@ -133,6 +133,38 @@ impl<O: ArrowNativeType> OffsetBuffer<O> {
         Self(out.into())
     }

+    /// Get an Iterator over the lengths of this [`OffsetBuffer`]
+    ///
+    /// ```
+    /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer};
+    /// let offsets = OffsetBuffer::<_>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
+    /// assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]);
+    /// ```
+    ///
+    /// Empty [`OffsetBuffer`] will return an empty iterator
+    /// ```
+    /// # use arrow_buffer::OffsetBuffer;
+    /// let offsets = OffsetBuffer::<i32>::new_empty();
+    /// assert_eq!(offsets.lengths().count(), 0);
+    /// ```
+    ///
+    /// This can be used to merge multiple [`OffsetBuffer`]s into one
+    /// ```
+    /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer};
+    ///
+    /// let buffer1 = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]);
+    /// let buffer2 = OffsetBuffer::<i32>::from_lengths([1, 3, 5, 7, 9]);
+    ///
+    /// let merged = OffsetBuffer::<i32>::from_lengths(
+    ///     vec![buffer1, buffer2].iter().flat_map(|x| x.lengths())
+    /// );
+    ///
+    /// assert_eq!(merged.lengths().collect::<Vec<usize>>(), &[2, 6, 3, 7, 2, 1, 3, 5, 7, 9]);
+    /// ```
+    pub fn lengths(&self) -> impl ExactSizeIterator<Item = usize> + '_ {
+        self.0.windows(2).map(|x| x[1].as_usize() - x[0].as_usize())
+    }
+
     /// Free up unused memory.
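    ///
    /// A minimal usage sketch (editorial illustration, not part of the original
    /// patch; it assumes only the `OffsetBuffer` API shown above):
    ///
    /// ```
    /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer};
    /// let mut offsets = OffsetBuffer::<_>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
    /// offsets.shrink_to_fit();
    /// // shrinking only releases excess capacity; the offsets (and hence the
    /// // lengths) are unchanged
    /// assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]);
    /// ```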
    pub fn shrink_to_fit(&mut self) {
        self.0.shrink_to_fit();
@@ -244,4 +276,24 @@ mod tests {
     fn from_lengths_usize_overflow() {
         OffsetBuffer::<i32>::from_lengths([usize::MAX, 1]);
     }
+
+    #[test]
+    fn get_lengths() {
+        let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
+        assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]);
+    }
+
+    #[test]
+    fn get_lengths_should_be_with_fixed_size() {
+        let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
+        let iter = offsets.lengths();
+        assert_eq!(iter.size_hint(), (3, Some(3)));
+        assert_eq!(iter.len(), 3);
+    }
+
+    #[test]
+    fn get_lengths_from_empty_offset_buffer_should_be_empty_iterator() {
+        let offsets = OffsetBuffer::<i32>::new_empty();
+        assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![]);
+    }
 }

diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs
index 129b90ee0470..4855e0087cc6 100644
--- a/arrow-select/src/concat.rs
+++ b/arrow-select/src/concat.rs
@@ -34,9 +34,9 @@
 use crate::dictionary::{merge_dictionary_values, should_merge_dictionary_values};
 use arrow_array::cast::AsArray;
 use arrow_array::types::*;
 use arrow_array::*;
-use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer};
+use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, OffsetBuffer};
 use arrow_data::transform::{Capacities, MutableArrayData};
-use arrow_schema::{ArrowError, DataType, SchemaRef};
+use arrow_schema::{ArrowError, DataType, FieldRef, SchemaRef};
 use std::sync::Arc;

 fn binary_capacity<T: ByteArrayType>(arrays: &[&dyn Array]) -> Capacities {
@@ -129,6 +129,54 @@ fn concat_dictionaries<K: ArrowDictionaryKeyType>(
     Ok(Arc::new(array))
 }

+fn concat_lists<OffsetSize: OffsetSizeTrait>(
+    arrays: &[&dyn Array],
+    field: &FieldRef,
+) -> Result<ArrayRef, ArrowError> {
+    let mut output_len = 0;
+    let mut list_has_nulls = false;
+
+    let lists = arrays
+        .iter()
+        .map(|x| x.as_list::<OffsetSize>())
+        .inspect(|l| {
+            output_len += l.len();
+            list_has_nulls |= l.null_count() != 0;
+        })
+        .collect::<Vec<_>>();
+
+    let lists_nulls = list_has_nulls.then(|| {
+        let mut nulls = BooleanBufferBuilder::new(output_len);
+        for l in &lists {
+            match l.nulls() {
+                Some(n) => nulls.append_buffer(n.inner()),
+                None => nulls.append_n(l.len(), true),
+            }
+        }
+        NullBuffer::new(nulls.finish())
+    });
+
+    let values: Vec<&dyn Array> = lists
+        .iter()
+        .map(|x| x.values().as_ref())
+        .collect::<Vec<_>>();
+
+    let concatenated_values = concat(values.as_slice())?;
+
+    // Merge value offsets from the lists
+    let value_offset_buffer =
+        OffsetBuffer::<OffsetSize>::from_lengths(lists.iter().flat_map(|x| x.offsets().lengths()));
+
+    let array = GenericListArray::<OffsetSize>::try_new(
+        Arc::clone(field),
+        value_offset_buffer,
+        concatenated_values,
+        lists_nulls,
+    )?;
+
+    Ok(Arc::new(array))
+}
+
 macro_rules! dict_helper {
     ($t:ty, $arrays:expr) => {
         return Ok(Arc::new(concat_dictionaries::<$t>($arrays)?) as _)
@@ -163,14 +211,20 @@ pub fn concat(arrays: &[&dyn Array]) -> Result<ArrayRef, ArrowError> {
             "It is not possible to concatenate arrays of different data types.".to_string(),
         ));
     }
-    if let DataType::Dictionary(k, _) = d {
-        downcast_integer! {
-            k.as_ref() => (dict_helper, arrays),
-            _ => unreachable!("illegal dictionary key type {k}")
-        };
-    } else {
-        let capacity = get_capacity(arrays, d);
-        concat_fallback(arrays, capacity)
+
+    match d {
+        DataType::Dictionary(k, _) => {
+            downcast_integer! {
+                k.as_ref() => (dict_helper, arrays),
+                _ => unreachable!("illegal dictionary key type {k}")
+            }
+        }
+        DataType::List(field) => concat_lists::<i32>(arrays, field),
+        DataType::LargeList(field) => concat_lists::<i64>(arrays, field),
+        _ => {
+            let capacity = get_capacity(arrays, d);
+            concat_fallback(arrays, capacity)
+        }
     }
 }

@@ -228,8 +282,9 @@ pub fn concat_batches<'a>(
 #[cfg(test)]
 mod tests {
     use super::*;
-    use arrow_array::builder::StringDictionaryBuilder;
+    use arrow_array::builder::{GenericListBuilder, StringDictionaryBuilder};
     use arrow_schema::{Field, Schema};
+    use std::fmt::Debug;

     #[test]
     fn test_concat_empty_vec() {
@@ -851,4 +906,118 @@ mod tests {
         assert_eq!(array.null_count(), 10);
         assert_eq!(array.logical_null_count(), 10);
     }
+
+    #[test]
+    fn concat_dictionary_list_array_simple() {
+        let scalars = vec![
+            create_single_row_list_of_dict(vec![Some("a")]),
+            create_single_row_list_of_dict(vec![Some("a")]),
+            create_single_row_list_of_dict(vec![Some("b")]),
+        ];
+
+        let arrays = scalars
+            .iter()
+            .map(|a| a as &(dyn Array))
+            .collect::<Vec<_>>();
+        let concat_res = concat(arrays.as_slice()).unwrap();
+
+        let expected_list = create_list_of_dict(vec![
+            // Row 1
+            Some(vec![Some("a")]),
+            Some(vec![Some("a")]),
+            Some(vec![Some("b")]),
+        ]);
+
+        let list = concat_res.as_list::<i32>();
+
+        // Assert that the list is equal to the expected list
+        list.iter().zip(expected_list.iter()).for_each(|(a, b)| {
+            assert_eq!(a, b);
+        });
+
+        assert_dictionary_has_unique_values::<_, StringArray>(
+            list.values().as_dictionary::<Int32Type>(),
+        );
+    }
+
+    #[test]
+    fn concat_many_dictionary_list_arrays() {
+        let number_of_unique_values = 8;
+        let scalars = (0..80000)
+            .map(|i| {
+                create_single_row_list_of_dict(vec![Some(
+                    (i % number_of_unique_values).to_string(),
+                )])
+            })
+            .collect::<Vec<_>>();
+
+        let arrays = scalars
+            .iter()
+            .map(|a| a as &(dyn Array))
+            .collect::<Vec<_>>();
+        let concat_res = concat(arrays.as_slice()).unwrap();
+
+        let expected_list = create_list_of_dict(
+            (0..80000)
+                .map(|i| Some(vec![Some((i % number_of_unique_values).to_string())]))
+                .collect::<Vec<_>>(),
+        );
+
+        let list = concat_res.as_list::<i32>();
+
+        // Assert that the list is equal to the expected list
+        list.iter().zip(expected_list.iter()).for_each(|(a, b)| {
+            assert_eq!(a, b);
+        });
+
+        assert_dictionary_has_unique_values::<_, StringArray>(
+            list.values().as_dictionary::<Int32Type>(),
+        );
+    }
+
+    fn create_single_row_list_of_dict(
+        list_items: Vec<Option<impl AsRef<str>>>,
+    ) -> GenericListArray<i32> {
+        let rows = list_items.into_iter().map(Some).collect();
+
+        create_list_of_dict(vec![rows])
+    }
+
+    fn create_list_of_dict(
+        rows: Vec<Option<Vec<Option<impl AsRef<str>>>>>,
+    ) -> GenericListArray<i32> {
+        let mut builder =
+            GenericListBuilder::<i32, _>::new(StringDictionaryBuilder::<Int32Type>::new());

+        for row in rows {
+            builder.append_option(row);
+        }
+
+        builder.finish()
+    }
+
+    fn assert_dictionary_has_unique_values<'a, K, V>(array: &'a DictionaryArray<K>)
+    where
+        K: ArrowDictionaryKeyType,
+        V: Sync + Send + 'static,
+        &'a V: ArrayAccessor + IntoIterator,
+
+        <&'a V as ArrayAccessor>::Item: Default + Clone + PartialEq + Debug + Ord,
+        <&'a V as IntoIterator>::Item: Clone + PartialEq + Debug + Ord,
+    {
+        let dict = array.downcast_dict::<V>().unwrap();
+        let mut values = dict.values().into_iter().collect::<Vec<_>>();
+
+        // remove duplicates (the values must be sorted first so we can compare)
+        values.sort();
+
+        let mut unique_values = values.clone();
+
+        unique_values.dedup();
+
+        assert_eq!(
+            values, unique_values,
+            "There are duplicates in the value list (the value list here is sorted which is only for the assertion)"
+        );
+    }
 }

From
91e9380f20cb18638ec4f0d249c35182afc01a24 Mon Sep 17 00:00:00 2001 From: Himadri Pal Date: Sat, 4 Jan 2025 11:21:37 -0800 Subject: [PATCH 32/68] remove println (#6935) --- arrow-cast/src/cast/mod.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 0946af53a60f..0e56d7633a80 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -4072,7 +4072,6 @@ mod tests { Arc::new(StringViewArray::from(vec![Some("1.5"), Some("2.5"), None])); for array in inputs { - println!("type: {}", array.data_type()); assert!(can_cast_types(array.data_type(), &DataType::Utf8View)); let arr = cast(&array, &DataType::Utf8View).unwrap(); assert_eq!(expected.as_ref(), arr.as_ref()); @@ -10318,7 +10317,6 @@ mod tests { fn test_decimal_to_decimal_throw_error_on_precision_overflow_same_scale() { let array = vec![Some(123456789)]; let array = create_decimal128_array(array, 24, 2).unwrap(); - println!("{:?}", array); let input_type = DataType::Decimal128(24, 2); let output_type = DataType::Decimal128(6, 2); assert!(can_cast_types(&input_type, &output_type)); @@ -10336,7 +10334,6 @@ mod tests { fn test_decimal_to_decimal_throw_error_on_precision_overflow_lower_scale() { let array = vec![Some(123456789)]; let array = create_decimal128_array(array, 24, 2).unwrap(); - println!("{:?}", array); let input_type = DataType::Decimal128(24, 4); let output_type = DataType::Decimal128(6, 2); assert!(can_cast_types(&input_type, &output_type)); @@ -10347,14 +10344,13 @@ mod tests { }; let result = cast_with_options(&array, &output_type, &options); assert_eq!(result.unwrap_err().to_string(), - "Invalid argument error: 123456790 is too large to store in a Decimal128 of precision 6. Max is 999999"); + "Invalid argument error: 1234568 is too large to store in a Decimal128 of precision 6. Max is 999999"); } #[test] fn test_decimal_to_decimal_throw_error_on_precision_overflow_greater_scale() { let array = vec![Some(123456789)]; let array = create_decimal128_array(array, 24, 2).unwrap(); - println!("{:?}", array); let input_type = DataType::Decimal128(24, 2); let output_type = DataType::Decimal128(6, 3); assert!(can_cast_types(&input_type, &output_type)); @@ -10372,7 +10368,6 @@ mod tests { fn test_decimal_to_decimal_throw_error_on_precision_overflow_diff_type() { let array = vec![Some(123456789)]; let array = create_decimal128_array(array, 24, 2).unwrap(); - println!("{:?}", array); let input_type = DataType::Decimal128(24, 2); let output_type = DataType::Decimal256(6, 2); assert!(can_cast_types(&input_type, &output_type)); From 005bfe8ae1c1f462154228ddcd57d29716bd5795 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 4 Jan 2025 14:32:17 -0500 Subject: [PATCH 33/68] Minor: improve `zip` kernel docs, add examples (#6928) * Minor: improve `zip` kernel docs` * Add example for zip with scalar --- arrow-select/src/zip.rs | 66 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/arrow-select/src/zip.rs b/arrow-select/src/zip.rs index acb31dfa3bc2..2efd2e749921 100644 --- a/arrow-select/src/zip.rs +++ b/arrow-select/src/zip.rs @@ -15,20 +15,72 @@ // specific language governing permissions and limitations // under the License. -//! Zip two arrays by some boolean mask. Where the mask evaluates `true` values of `truthy` +//! 
[`zip`]: Combine values from two arrays based on boolean mask use crate::filter::SlicesIterator; use arrow_array::*; use arrow_data::transform::MutableArrayData; use arrow_schema::ArrowError; -/// Zip two arrays by some boolean mask. Where the mask evaluates `true` values of `truthy` -/// are taken, where the mask evaluates `false` values of `falsy` are taken. +/// Zip two arrays by some boolean mask. /// -/// # Arguments -/// * `mask` - Boolean values used to determine from which array to take the values. -/// * `truthy` - Values of this array are taken if mask evaluates `true` -/// * `falsy` - Values of this array are taken if mask evaluates `false` +/// - Where `mask` is `true`, values of `truthy` are taken +/// - Where `mask` is `false` or `NULL`, values of `falsy` are taken +/// +/// # Example: `zip` two arrays +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, BooleanArray, Int32Array}; +/// # use arrow_select::zip::zip; +/// // mask: [true, true, false, NULL, true] +/// let mask = BooleanArray::from(vec![ +/// Some(true), Some(true), Some(false), None, Some(true) +/// ]); +/// // truthy array: [1, NULL, 3, 4, 5] +/// let truthy = Int32Array::from(vec![ +/// Some(1), None, Some(3), Some(4), Some(5) +/// ]); +/// // falsy array: [10, 20, 30, 40, 50] +/// let falsy = Int32Array::from(vec![ +/// Some(10), Some(20), Some(30), Some(40), Some(50) +/// ]); +/// // zip with this mask select the first, second and last value from `truthy` +/// // and the third and fourth value from `falsy` +/// let result = zip(&mask, &truthy, &falsy).unwrap(); +/// // Expected: [1, NULL, 30, 40, 5] +/// let expected: ArrayRef = Arc::new(Int32Array::from(vec![ +/// Some(1), None, Some(30), Some(40), Some(5) +/// ])); +/// assert_eq!(&result, &expected); +/// ``` +/// +/// # Example: `zip` and array with a scalar +/// +/// Use `zip` to replace certain values in an array with a scalar +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, BooleanArray, Int32Array}; +/// # use arrow_select::zip::zip; +/// // mask: [true, true, false, NULL, true] +/// let mask = BooleanArray::from(vec![ +/// Some(true), Some(true), Some(false), None, Some(true) +/// ]); +/// // array: [1, NULL, 3, 4, 5] +/// let arr = Int32Array::from(vec![ +/// Some(1), None, Some(3), Some(4), Some(5) +/// ]); +/// // scalar: 42 +/// let scalar = Int32Array::new_scalar(42); +/// // zip the array with the mask select the first, second and last value from `arr` +/// // and fill the third and fourth value with the scalar 42 +/// let result = zip(&mask, &arr, &scalar).unwrap(); +/// // Expected: [1, NULL, 42, 42, 5] +/// let expected: ArrayRef = Arc::new(Int32Array::from(vec![ +/// Some(1), None, Some(42), Some(42), Some(5) +/// ])); +/// assert_eq!(&result, &expected); +/// ``` pub fn zip( mask: &BooleanArray, truthy: &dyn Datum, From 496de6692946614e2133aac680c7cf37cee27b2a Mon Sep 17 00:00:00 2001 From: Vrishabh Date: Sun, 5 Jan 2025 15:54:14 +0530 Subject: [PATCH 34/68] Minor clippy fixes (#6942) --- arrow-cast/src/parse.rs | 14 ++++---------- arrow-flight/src/encode.rs | 2 +- object_store/src/aws/client.rs | 2 +- object_store/src/azure/client.rs | 2 +- object_store/src/gcp/client.rs | 2 +- parquet/src/arrow/async_reader/mod.rs | 2 +- parquet/src/column/writer/mod.rs | 4 ++-- 7 files changed, 11 insertions(+), 17 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index f4c4639c1c08..4e93e9787cc8 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -881,7 +881,7 @@ 
pub fn parse_decimal( for (_, b) in bs.by_ref() { if !b.is_ascii_digit() { if *b == b'e' || *b == b'E' { - result = match parse_e_notation::( + result = parse_e_notation::( s, digits as u16, fractionals as i16, @@ -889,10 +889,7 @@ pub fn parse_decimal( point_index, precision as u16, scale as i16, - ) { - Err(e) => return Err(e), - Ok(v) => v, - }; + )?; is_e_notation = true; @@ -926,7 +923,7 @@ pub fn parse_decimal( } } b'e' | b'E' => { - result = match parse_e_notation::( + result = parse_e_notation::( s, digits as u16, fractionals as i16, @@ -934,10 +931,7 @@ pub fn parse_decimal( index, precision as u16, scale as i16, - ) { - Err(e) => return Err(e), - Ok(v) => v, - }; + )?; is_e_notation = true; diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 315b7b3cb6e5..19fe42474405 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -1833,7 +1833,7 @@ mod tests { .flight_descriptor .as_ref() .map(|descriptor| { - let path_len: usize = descriptor.path.iter().map(|p| p.as_bytes().len()).sum(); + let path_len: usize = descriptor.path.iter().map(|p| p.len()).sum(); std::mem::size_of_val(descriptor) + descriptor.cmd.len() + path_len }) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index 25fdd3311c95..b81be0c0efad 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -299,7 +299,7 @@ pub(crate) struct Request<'a> { retry_error_body: bool, } -impl<'a> Request<'a> { +impl Request<'_> { pub(crate) fn query(self, query: &T) -> Self { let builder = self.builder.query(query); Self { builder, ..self } diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index ea3a5faf3ad8..bd72d0c6aee1 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -198,7 +198,7 @@ struct PutRequest<'a> { idempotent: bool, } -impl<'a> PutRequest<'a> { +impl PutRequest<'_> { fn header(self, k: &HeaderName, v: &str) -> Self { let builder = self.builder.header(k, v); Self { builder, ..self } diff --git a/object_store/src/gcp/client.rs b/object_store/src/gcp/client.rs index 1928d13b4739..d6f89ca71740 100644 --- a/object_store/src/gcp/client.rs +++ b/object_store/src/gcp/client.rs @@ -173,7 +173,7 @@ pub(crate) struct Request<'a> { idempotent: bool, } -impl<'a> Request<'a> { +impl Request<'_> { fn header(self, k: &HeaderName, v: &str) -> Self { let builder = self.builder.header(k, v); Self { builder, ..self } diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 96715e1164b2..4f3befe42662 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -792,7 +792,7 @@ struct InMemoryRowGroup<'a> { row_count: usize, } -impl<'a> InMemoryRowGroup<'a> { +impl InMemoryRowGroup<'_> { /// Fetches the necessary column data into memory async fn fetch( &mut self, diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 8dc1d0db4476..5f34f34cbb7a 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -3275,8 +3275,8 @@ mod tests { fn test_truncate_utf8() { // No-op let data = "❤️🧡💛💚💙💜"; - let r = truncate_utf8(data, data.as_bytes().len()).unwrap(); - assert_eq!(r.len(), data.as_bytes().len()); + let r = truncate_utf8(data, data.len()).unwrap(); + assert_eq!(r.len(), data.len()); assert_eq!(&r, data.as_bytes()); // We slice it away from the UTF8 boundary From 30f46c73b021256743a5ef9ea6062f20c0553cc2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb 
Date: Mon, 6 Jan 2025 16:03:51 -0500 Subject: [PATCH 35/68] Improve `Buffer` documentation, deprecate `Buffer::from_bytes` add `From` and `From` impls (#6939) * Improve Bytes documentation * Improve Buffer documentation, add From and From impls * avoid linking to private docs * Deprecate `Buffer::from_bytes` * Apply suggestions from code review Co-authored-by: Jeffrey Vo --------- Co-authored-by: Jeffrey Vo --- arrow-buffer/src/buffer/immutable.rs | 118 ++++++++++++++---- arrow-buffer/src/buffer/mutable.rs | 2 +- arrow-buffer/src/bytes.rs | 8 +- arrow-flight/src/decode.rs | 2 +- arrow-flight/src/sql/client.rs | 2 +- .../src/arrow/array_reader/byte_view_array.rs | 10 +- 6 files changed, 104 insertions(+), 38 deletions(-) diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index cf1d6f366751..fd145ce2306e 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -28,8 +28,43 @@ use crate::{bit_util, bytes::Bytes, native::ArrowNativeType}; use super::ops::bitwise_unary_op_helper; use super::{MutableBuffer, ScalarBuffer}; -/// Buffer represents a contiguous memory region that can be shared with other buffers and across -/// thread boundaries. +/// A contiguous memory region that can be shared with other buffers and across +/// thread boundaries that stores Arrow data. +/// +/// `Buffer`s can be sliced and cloned without copying the underlying data and can +/// be created from memory allocated by non-Rust sources such as C/C++. +/// +/// # Example: Create a `Buffer` from a `Vec` (without copying) +/// ``` +/// # use arrow_buffer::Buffer; +/// let vec: Vec = vec![1, 2, 3]; +/// let buffer = Buffer::from(vec); +/// ``` +/// +/// # Example: Convert a `Buffer` to a `Vec` (without copying) +/// +/// Use [`Self::into_vec`] to convert a `Buffer` back into a `Vec` if there are +/// no other references and the types are aligned correctly. +/// ``` +/// # use arrow_buffer::Buffer; +/// # let vec: Vec = vec![1, 2, 3]; +/// # let buffer = Buffer::from(vec); +/// // convert the buffer back into a Vec of u32 +/// // note this will fail if the buffer is shared or not aligned correctly +/// let vec: Vec = buffer.into_vec().unwrap(); +/// ``` +/// +/// # Example: Create a `Buffer` from a [`bytes::Bytes`] (without copying) +/// +/// [`bytes::Bytes`] is a common type in the Rust ecosystem for shared memory +/// regions. You can create a buffer from a `Bytes` instance using the `From` +/// implementation, also without copying. +/// +/// ``` +/// # use arrow_buffer::Buffer; +/// let bytes = bytes::Bytes::from("hello"); +/// let buffer = Buffer::from(bytes); +///``` #[derive(Clone, Debug)] pub struct Buffer { /// the internal byte buffer. @@ -59,24 +94,15 @@ unsafe impl Send for Buffer where Bytes: Send {} unsafe impl Sync for Buffer where Bytes: Sync {} impl Buffer { - /// Auxiliary method to create a new Buffer + /// Create a new Buffer from a (internal) `Bytes` /// - /// This can be used with a [`bytes::Bytes`] via `into()`: + /// NOTE despite the same name, `Bytes` is an internal struct in arrow-rs + /// and is different than [`bytes::Bytes`]. /// - /// ``` - /// # use arrow_buffer::Buffer; - /// let bytes = bytes::Bytes::from_static(b"foo"); - /// let buffer = Buffer::from_bytes(bytes.into()); - /// ``` - #[inline] + /// See examples on [`Buffer`] for ways to create a buffer from a [`bytes::Bytes`]. 
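+    ///
+    /// (Editorial migration sketch, not part of the original patch: code that
+    /// called `Buffer::from_bytes(bytes)` can simply call `Buffer::from(bytes)`
+    /// instead, since this patch adds the corresponding `From` impl.)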
+ #[deprecated(since = "54.1.0", note = "Use Buffer::from instead")] pub fn from_bytes(bytes: Bytes) -> Self { - let length = bytes.len(); - let ptr = bytes.as_ptr(); - Buffer { - data: Arc::new(bytes), - ptr, - length, - } + Self::from(bytes) } /// Returns the offset, in bytes, of `Self::ptr` to `Self::data` @@ -107,8 +133,11 @@ impl Buffer { buffer.into() } - /// Creates a buffer from an existing memory region. Ownership of the memory is tracked via reference counting - /// and the memory will be freed using the `drop` method of [crate::alloc::Allocation] when the reference count reaches zero. + /// Creates a buffer from an existing memory region. + /// + /// Ownership of the memory is tracked via reference counting + /// and the memory will be freed using the `drop` method of + /// [crate::alloc::Allocation] when the reference count reaches zero. /// /// # Arguments /// @@ -155,7 +184,7 @@ impl Buffer { self.data.capacity() } - /// Tried to shrink the capacity of the buffer as much as possible, freeing unused memory. + /// Tries to shrink the capacity of the buffer as much as possible, freeing unused memory. /// /// If the buffer is shared, this is a no-op. /// @@ -190,7 +219,7 @@ impl Buffer { } } - /// Returns whether the buffer is empty. + /// Returns true if the buffer is empty. #[inline] pub fn is_empty(&self) -> bool { self.length == 0 @@ -206,7 +235,9 @@ impl Buffer { } /// Returns a new [Buffer] that is a slice of this buffer starting at `offset`. - /// Doing so allows the same memory region to be shared between buffers. + /// + /// This function is `O(1)` and does not copy any data, allowing the + /// same memory region to be shared between buffers. /// /// # Panics /// @@ -240,7 +271,10 @@ impl Buffer { /// Returns a new [Buffer] that is a slice of this buffer starting at `offset`, /// with `length` bytes. - /// Doing so allows the same memory region to be shared between buffers. + /// + /// This function is `O(1)` and does not copy any data, allowing the same + /// memory region to be shared between buffers. + /// /// # Panics /// Panics iff `(offset + length)` is larger than the existing length. pub fn slice_with_length(&self, offset: usize, length: usize) -> Self { @@ -328,10 +362,16 @@ impl Buffer { }) } - /// Returns `Vec` for mutating the buffer + /// Converts self into a `Vec`, if possible. + /// + /// This can be used to reuse / mutate the underlying data. /// - /// Returns `Err(self)` if this buffer does not have the same [`Layout`] as - /// the destination Vec or contains a non-zero offset + /// # Errors + /// + /// Returns `Err(self)` if + /// 1. this buffer does not have the same [`Layout`] as the destination Vec + /// 2. contains a non-zero offset + /// 3. 
The buffer is shared pub fn into_vec(self) -> Result, Self> { let layout = match self.data.deallocation() { Deallocation::Standard(l) => l, @@ -414,7 +454,29 @@ impl From> for Buffer { } } -/// Creating a `Buffer` instance by storing the boolean values into the buffer +/// Convert from internal `Bytes` (not [`bytes::Bytes`]) to `Buffer` +impl From for Buffer { + #[inline] + fn from(bytes: Bytes) -> Self { + let length = bytes.len(); + let ptr = bytes.as_ptr(); + Self { + data: Arc::new(bytes), + ptr, + length, + } + } +} + +/// Convert from [`bytes::Bytes`], not internal `Bytes` to `Buffer` +impl From for Buffer { + fn from(bytes: bytes::Bytes) -> Self { + let bytes: Bytes = bytes.into(); + Self::from(bytes) + } +} + +/// Create a `Buffer` instance by storing the boolean values into the buffer impl FromIterator for Buffer { fn from_iter(iter: I) -> Self where @@ -447,7 +509,9 @@ impl From> for Buffer { impl Buffer { /// Creates a [`Buffer`] from an [`Iterator`] with a trusted (upper) length. + /// /// Prefer this to `collect` whenever possible, as it is ~60% faster. + /// /// # Example /// ``` /// # use arrow_buffer::buffer::Buffer; diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index c4315a1d64cd..5ad55e306e2a 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -328,7 +328,7 @@ impl MutableBuffer { pub(super) fn into_buffer(self) -> Buffer { let bytes = unsafe { Bytes::new(self.data, self.len, Deallocation::Standard(self.layout)) }; std::mem::forget(self); - Buffer::from_bytes(bytes) + Buffer::from(bytes) } /// View this buffer as a mutable slice of a specific type. diff --git a/arrow-buffer/src/bytes.rs b/arrow-buffer/src/bytes.rs index 77724137aef7..b811bd2c6b40 100644 --- a/arrow-buffer/src/bytes.rs +++ b/arrow-buffer/src/bytes.rs @@ -28,14 +28,18 @@ use crate::buffer::dangling_ptr; /// A continuous, fixed-size, immutable memory region that knows how to de-allocate itself. /// -/// This structs' API is inspired by the `bytes::Bytes`, but it is not limited to using rust's -/// global allocator nor u8 alignment. +/// Note that this structure is an internal implementation detail of the +/// arrow-rs crate. While it has the same name and similar API as +/// [`bytes::Bytes`] it is not limited to rust's global allocator nor u8 +/// alignment. It is possible to create a `Bytes` from `bytes::Bytes` using the +/// `From` implementation. /// /// In the most common case, this buffer is allocated using [`alloc`](std::alloc::alloc) /// with an alignment of [`ALIGNMENT`](crate::alloc::ALIGNMENT) /// /// When the region is allocated by a different allocator, [Deallocation::Custom], this calls the /// custom deallocator to deallocate the region when it is no longer needed. 
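///
/// For example (editorial note, not from the original patch), a region handed
/// over FFI together with its own free routine can be tracked with a custom
/// deallocation, so dropping the `Bytes` releases the memory through that
/// routine rather than through Rust's global allocator.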
+/// pub struct Bytes { /// The raw pointer to be beginning of the region ptr: NonNull, diff --git a/arrow-flight/src/decode.rs b/arrow-flight/src/decode.rs index 7bafc384306b..760fc926fca6 100644 --- a/arrow-flight/src/decode.rs +++ b/arrow-flight/src/decode.rs @@ -295,7 +295,7 @@ impl FlightDataDecoder { )); }; - let buffer = Buffer::from_bytes(data.data_body.into()); + let buffer = Buffer::from(data.data_body); let dictionary_batch = message.header_as_dictionary_batch().ok_or_else(|| { FlightError::protocol( "Could not get dictionary batch from DictionaryBatch message", diff --git a/arrow-flight/src/sql/client.rs b/arrow-flight/src/sql/client.rs index a6e228737b3f..6d3ac3dbe610 100644 --- a/arrow-flight/src/sql/client.rs +++ b/arrow-flight/src/sql/client.rs @@ -721,7 +721,7 @@ pub fn arrow_data_from_flight_data( let dictionaries_by_field = HashMap::new(); let record_batch = read_record_batch( - &Buffer::from_bytes(flight_data.data_body.into()), + &Buffer::from(flight_data.data_body), ipc_record_batch, arrow_schema_ref.clone(), &dictionaries_by_field, diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs b/parquet/src/arrow/array_reader/byte_view_array.rs index 5845e2c08cec..92a8b0592d0d 100644 --- a/parquet/src/arrow/array_reader/byte_view_array.rs +++ b/parquet/src/arrow/array_reader/byte_view_array.rs @@ -316,9 +316,8 @@ impl ByteViewArrayDecoderPlain { } pub fn read(&mut self, output: &mut ViewBuffer, len: usize) -> Result { - // Here we convert `bytes::Bytes` into `arrow_buffer::Bytes`, which is zero copy - // Then we convert `arrow_buffer::Bytes` into `arrow_buffer:Buffer`, which is also zero copy - let buf = arrow_buffer::Buffer::from_bytes(self.buf.clone().into()); + // Zero copy convert `bytes::Bytes` into `arrow_buffer::Buffer` + let buf = arrow_buffer::Buffer::from(self.buf.clone()); let block_id = output.append_block(buf); let to_read = len.min(self.max_remaining_values); @@ -549,9 +548,8 @@ impl ByteViewArrayDecoderDeltaLength { let src_lengths = &self.lengths[self.length_offset..self.length_offset + to_read]; - // Here we convert `bytes::Bytes` into `arrow_buffer::Bytes`, which is zero copy - // Then we convert `arrow_buffer::Bytes` into `arrow_buffer:Buffer`, which is also zero copy - let bytes = arrow_buffer::Buffer::from_bytes(self.data.clone().into()); + // Zero copy convert `bytes::Bytes` into `arrow_buffer::Buffer` + let bytes = Buffer::from(self.data.clone()); let block_id = output.append_block(bytes); let mut current_offset = self.data_offset; From 618d81ce1f3bd7591ae0c40be19065e13d3d68d6 Mon Sep 17 00:00:00 2001 From: Jinpeng Date: Mon, 6 Jan 2025 16:09:05 -0500 Subject: [PATCH 36/68] Convert some panics that happen on invalid parquet files to error results (#6738) * Reduce panics * t pushmove integer logical type from format.rs to schema type.rs * remove some changes as per reviews * use wrapping_shl * fix typo in error message * return error for invalid decimal length --------- Co-authored-by: jp0317 Co-authored-by: Andrew Lamb --- parquet/src/errors.rs | 7 ++++ parquet/src/file/metadata/reader.rs | 26 ++++++------- parquet/src/file/serialized_reader.rs | 53 ++++++++++++++++++++++---- parquet/src/file/statistics.rs | 26 +++++++++++++ parquet/src/schema/types.rs | 25 +++++++++++- parquet/src/thrift.rs | 35 ++++++++++++++--- parquet/tests/arrow_reader/bad_data.rs | 2 +- 7 files changed, 146 insertions(+), 28 deletions(-) diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index 8dc97f4ca2e6..d749287bba62 100644 --- a/parquet/src/errors.rs +++ 
b/parquet/src/errors.rs @@ -17,6 +17,7 @@ //! Common Parquet errors and macros. +use core::num::TryFromIntError; use std::error::Error; use std::{cell, io, result, str}; @@ -81,6 +82,12 @@ impl Error for ParquetError { } } +impl From for ParquetError { + fn from(e: TryFromIntError) -> ParquetError { + ParquetError::General(format!("Integer overflow: {e}")) + } +} + impl From for ParquetError { fn from(e: io::Error) -> ParquetError { ParquetError::External(Box::new(e)) diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index ec2cd1094d3a..c6715a33b5ae 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -627,7 +627,8 @@ impl ParquetMetaDataReader { for rg in t_file_metadata.row_groups { row_groups.push(RowGroupMetaData::from_thrift(schema_descr.clone(), rg)?); } - let column_orders = Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr); + let column_orders = + Self::parse_column_orders(t_file_metadata.column_orders, &schema_descr)?; let file_metadata = FileMetaData::new( t_file_metadata.version, @@ -645,15 +646,13 @@ impl ParquetMetaDataReader { fn parse_column_orders( t_column_orders: Option>, schema_descr: &SchemaDescriptor, - ) -> Option> { + ) -> Result>> { match t_column_orders { Some(orders) => { // Should always be the case - assert_eq!( - orders.len(), - schema_descr.num_columns(), - "Column order length mismatch" - ); + if orders.len() != schema_descr.num_columns() { + return Err(general_err!("Column order length mismatch")); + }; let mut res = Vec::new(); for (i, column) in schema_descr.columns().iter().enumerate() { match orders[i] { @@ -667,9 +666,9 @@ impl ParquetMetaDataReader { } } } - Some(res) + Ok(Some(res)) } - None => None, + None => Ok(None), } } } @@ -741,7 +740,7 @@ mod tests { ]); assert_eq!( - ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr), + ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr).unwrap(), Some(vec![ ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED), ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED) @@ -750,20 +749,21 @@ mod tests { // Test when no column orders are defined. 
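        // (Editorial note, not in the original patch:) a file without a
        // column_orders field is valid, so parsing yields `Ok(None)` rather
        // than an error.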
assert_eq!( - ParquetMetaDataReader::parse_column_orders(None, &schema_descr), + ParquetMetaDataReader::parse_column_orders(None, &schema_descr).unwrap(), None ); } #[test] - #[should_panic(expected = "Column order length mismatch")] fn test_metadata_column_orders_len_mismatch() { let schema = SchemaType::group_type_builder("schema").build().unwrap(); let schema_descr = SchemaDescriptor::new(Arc::new(schema)); let t_column_orders = Some(vec![TColumnOrder::TYPEORDER(TypeDefinedOrder::new())]); - ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr); + let res = ParquetMetaDataReader::parse_column_orders(t_column_orders, &schema_descr); + assert!(res.is_err()); + assert!(format!("{:?}", res.unwrap_err()).contains("Column order length mismatch")); } #[test] diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 06f3cf9fb23f..a942481f7e4d 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -435,7 +435,7 @@ pub(crate) fn decode_page( let is_sorted = dict_header.is_sorted.unwrap_or(false); Page::DictionaryPage { buf: buffer, - num_values: dict_header.num_values as u32, + num_values: dict_header.num_values.try_into()?, encoding: Encoding::try_from(dict_header.encoding)?, is_sorted, } @@ -446,7 +446,7 @@ pub(crate) fn decode_page( .ok_or_else(|| ParquetError::General("Missing V1 data page header".to_string()))?; Page::DataPage { buf: buffer, - num_values: header.num_values as u32, + num_values: header.num_values.try_into()?, encoding: Encoding::try_from(header.encoding)?, def_level_encoding: Encoding::try_from(header.definition_level_encoding)?, rep_level_encoding: Encoding::try_from(header.repetition_level_encoding)?, @@ -460,12 +460,12 @@ pub(crate) fn decode_page( let is_compressed = header.is_compressed.unwrap_or(true); Page::DataPageV2 { buf: buffer, - num_values: header.num_values as u32, + num_values: header.num_values.try_into()?, encoding: Encoding::try_from(header.encoding)?, - num_nulls: header.num_nulls as u32, - num_rows: header.num_rows as u32, - def_levels_byte_len: header.definition_levels_byte_length as u32, - rep_levels_byte_len: header.repetition_levels_byte_length as u32, + num_nulls: header.num_nulls.try_into()?, + num_rows: header.num_rows.try_into()?, + def_levels_byte_len: header.definition_levels_byte_length.try_into()?, + rep_levels_byte_len: header.repetition_levels_byte_length.try_into()?, is_compressed, statistics: statistics::from_thrift(physical_type, header.statistics)?, } @@ -578,6 +578,27 @@ impl Iterator for SerializedPageReader { } } +fn verify_page_header_len(header_len: usize, remaining_bytes: usize) -> Result<()> { + if header_len > remaining_bytes { + return Err(eof_err!("Invalid page header")); + } + Ok(()) +} + +fn verify_page_size( + compressed_size: i32, + uncompressed_size: i32, + remaining_bytes: usize, +) -> Result<()> { + // The page's compressed size should not exceed the remaining bytes that are + // available to read. The page's uncompressed size is the expected size + // after decompression, which can never be negative. 
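+    // (Editorial illustration, not in the original patch: with these checks, a
+    // corrupt header claiming e.g. compressed_page_size = 2_000_000_000 while
+    // only a few KiB remain in the row group now fails with an EOF error
+    // instead of attempting an enormous read.)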
+ if compressed_size < 0 || compressed_size as usize > remaining_bytes || uncompressed_size < 0 { + return Err(eof_err!("Invalid page header")); + } + Ok(()) +} + impl PageReader for SerializedPageReader { fn get_next_page(&mut self) -> Result> { loop { @@ -596,10 +617,16 @@ impl PageReader for SerializedPageReader { *header } else { let (header_len, header) = read_page_header_len(&mut read)?; + verify_page_header_len(header_len, *remaining)?; *offset += header_len; *remaining -= header_len; header }; + verify_page_size( + header.compressed_page_size, + header.uncompressed_page_size, + *remaining, + )?; let data_len = header.compressed_page_size as usize; *offset += data_len; *remaining -= data_len; @@ -683,6 +710,7 @@ impl PageReader for SerializedPageReader { } else { let mut read = self.reader.get_read(*offset as u64)?; let (header_len, header) = read_page_header_len(&mut read)?; + verify_page_header_len(header_len, *remaining_bytes)?; *offset += header_len; *remaining_bytes -= header_len; let page_meta = if let Ok(page_meta) = (&header).try_into() { @@ -733,12 +761,23 @@ impl PageReader for SerializedPageReader { next_page_header, } => { if let Some(buffered_header) = next_page_header.take() { + verify_page_size( + buffered_header.compressed_page_size, + buffered_header.uncompressed_page_size, + *remaining_bytes, + )?; // The next page header has already been peeked, so just advance the offset *offset += buffered_header.compressed_page_size as usize; *remaining_bytes -= buffered_header.compressed_page_size as usize; } else { let mut read = self.reader.get_read(*offset as u64)?; let (header_len, header) = read_page_header_len(&mut read)?; + verify_page_header_len(header_len, *remaining_bytes)?; + verify_page_size( + header.compressed_page_size, + header.uncompressed_page_size, + *remaining_bytes, + )?; let data_page_size = header.compressed_page_size as usize; *offset += header_len + data_page_size; *remaining_bytes -= header_len + data_page_size; diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 2e05b83369cf..b7522a76f0fc 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -157,6 +157,32 @@ pub fn from_thrift( stats.max_value }; + fn check_len(min: &Option>, max: &Option>, len: usize) -> Result<()> { + if let Some(min) = min { + if min.len() < len { + return Err(ParquetError::General( + "Insufficient bytes to parse min statistic".to_string(), + )); + } + } + if let Some(max) = max { + if max.len() < len { + return Err(ParquetError::General( + "Insufficient bytes to parse max statistic".to_string(), + )); + } + } + Ok(()) + } + + match physical_type { + Type::BOOLEAN => check_len(&min, &max, 1), + Type::INT32 | Type::FLOAT => check_len(&min, &max, 4), + Type::INT64 | Type::DOUBLE => check_len(&min, &max, 8), + Type::INT96 => check_len(&min, &max, 12), + _ => Ok(()), + }?; + // Values are encoded using PLAIN encoding definition, except that // variable-length byte arrays do not include a length prefix. 
// diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index d168e46de047..d9e9b22e809f 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -556,7 +556,11 @@ impl<'a> PrimitiveTypeBuilder<'a> { } } PhysicalType::FIXED_LEN_BYTE_ARRAY => { - let max_precision = (2f64.powi(8 * self.length - 1) - 1f64).log10().floor() as i32; + let length = self + .length + .checked_mul(8) + .ok_or(general_err!("Invalid length {} for Decimal", self.length))?; + let max_precision = (2f64.powi(length - 1) - 1f64).log10().floor() as i32; if self.precision > max_precision { return Err(general_err!( @@ -1171,9 +1175,25 @@ pub fn from_thrift(elements: &[SchemaElement]) -> Result { )); } + if !schema_nodes[0].is_group() { + return Err(general_err!("Expected root node to be a group type")); + } + Ok(schema_nodes.remove(0)) } +/// Checks if the logical type is valid. +fn check_logical_type(logical_type: &Option) -> Result<()> { + if let Some(LogicalType::Integer { bit_width, .. }) = *logical_type { + if bit_width != 8 && bit_width != 16 && bit_width != 32 && bit_width != 64 { + return Err(general_err!( + "Bit width must be 8, 16, 32, or 64 for Integer logical type" + )); + } + } + Ok(()) +} + /// Constructs a new Type from the `elements`, starting at index `index`. /// The first result is the starting index for the next Type after this one. If it is /// equal to `elements.len()`, then this Type is the last one. @@ -1198,6 +1218,9 @@ fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize .logical_type .as_ref() .map(|value| LogicalType::from(value.clone())); + + check_logical_type(&logical_type)?; + let field_id = elements[index].field_id; match elements[index].num_children { // From parquet-format: diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs index ceb6b1c29fe8..b216fec6f3e7 100644 --- a/parquet/src/thrift.rs +++ b/parquet/src/thrift.rs @@ -67,7 +67,7 @@ impl<'a> TCompactSliceInputProtocol<'a> { let mut shift = 0; loop { let byte = self.read_byte()?; - in_progress |= ((byte & 0x7F) as u64) << shift; + in_progress |= ((byte & 0x7F) as u64).wrapping_shl(shift); shift += 7; if byte & 0x80 == 0 { return Ok(in_progress); @@ -96,13 +96,22 @@ impl<'a> TCompactSliceInputProtocol<'a> { } } +macro_rules! 
thrift_unimplemented { + () => { + Err(thrift::Error::Protocol(thrift::ProtocolError { + kind: thrift::ProtocolErrorKind::NotImplemented, + message: "not implemented".to_string(), + })) + }; +} + impl TInputProtocol for TCompactSliceInputProtocol<'_> { fn read_message_begin(&mut self) -> thrift::Result { unimplemented!() } fn read_message_end(&mut self) -> thrift::Result<()> { - unimplemented!() + thrift_unimplemented!() } fn read_struct_begin(&mut self) -> thrift::Result> { @@ -147,7 +156,21 @@ impl TInputProtocol for TCompactSliceInputProtocol<'_> { ), _ => { if field_delta != 0 { - self.last_read_field_id += field_delta as i16; + self.last_read_field_id = self + .last_read_field_id + .checked_add(field_delta as i16) + .map_or_else( + || { + Err(thrift::Error::Protocol(thrift::ProtocolError { + kind: thrift::ProtocolErrorKind::InvalidData, + message: format!( + "cannot add {} to {}", + field_delta, self.last_read_field_id + ), + })) + }, + Ok, + )?; } else { self.last_read_field_id = self.read_i16()?; }; @@ -226,15 +249,15 @@ impl TInputProtocol for TCompactSliceInputProtocol<'_> { } fn read_set_begin(&mut self) -> thrift::Result { - unimplemented!() + thrift_unimplemented!() } fn read_set_end(&mut self) -> thrift::Result<()> { - unimplemented!() + thrift_unimplemented!() } fn read_map_begin(&mut self) -> thrift::Result { - unimplemented!() + thrift_unimplemented!() } fn read_map_end(&mut self) -> thrift::Result<()> { diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index 74342031432a..cfd61e82d32b 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -106,7 +106,7 @@ fn test_arrow_rs_gh_6229_dict_header() { let err = read_file("ARROW-RS-GH-6229-DICTHEADER.parquet").unwrap_err(); assert_eq!( err.to_string(), - "External: Parquet argument error: EOF: eof decoding byte array" + "External: Parquet argument error: Parquet error: Integer overflow: out of range integral type conversion attempted" ); } From 1be307761fbf4f3f7ced48e16169b0ad77287bbe Mon Sep 17 00:00:00 2001 From: June <61218022+itsjunetime@users.noreply.github.com> Date: Mon, 6 Jan 2025 14:13:56 -0700 Subject: [PATCH 37/68] Update MSRVs to be accurate (#6742) * Update most MSRVs * Make cargo-msrv verify every package in repo instead of just a select few and purposefully break arrow-flight msrv * Add test to ensure workspace rust version is being used at least somewhere * Fix exit1 => exit 1 * Make arrow-flight work, at the very least, with 'cargo metadata' * Fix arrow-flight/gen rust-version to make CI pass now * Get rid of pretty msrv logging as it can't all be displayed * Do '-mindepth 2' with find to prevent running cargo msrv on the workspace as a whole * Use correct MSRV for object_store * remove workspace msrv check * revert msrv * push object_store MSRV back down to 1.62.1 * Revert unrelated formatting changes * Fix object_store msrv --------- Co-authored-by: Andrew Lamb Co-authored-by: Jeffrey Vo --- .github/workflows/rust.yml | 28 +++++--------------- Cargo.toml | 2 +- arrow-flight/gen/Cargo.toml | 2 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-pyarrow-integration-testing/Cargo.toml | 2 +- arrow-schema/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- parquet/Cargo.toml | 2 +- 8 files changed, 14 insertions(+), 28 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 044250b70435..ca0d2441ceae 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -123,23 +123,6 @@ jobs: uses: 
./.github/actions/setup-builder - name: Install cargo-msrv run: cargo install cargo-msrv - - name: Downgrade arrow dependencies - run: cargo update -p ahash --precise 0.8.7 - - name: Check arrow - working-directory: arrow - run: | - # run `cd arrow; cargo msrv verify` to see problematic dependencies - cargo msrv verify --output-format=json - - name: Check parquet - working-directory: parquet - run: | - # run `cd parquet; cargo msrv verify` to see problematic dependencies - cargo msrv verify --output-format=json - - name: Check arrow-flight - working-directory: arrow-flight - run: | - # run `cd arrow-flight; cargo msrv verify` to see problematic dependencies - cargo msrv verify --output-format=json - name: Downgrade object_store dependencies working-directory: object_store # Necessary because tokio 1.30.0 updates MSRV to 1.63 @@ -147,8 +130,11 @@ jobs: run: | cargo update -p tokio --precise 1.29.1 cargo update -p url --precise 2.5.0 - - name: Check object_store - working-directory: object_store + - name: Check all packages run: | - # run `cd object_store; cargo msrv verify` to see problematic dependencies - cargo msrv verify --output-format=json + # run `cargo msrv verify --manifest-path "path/to/Cargo.toml"` to see problematic dependencies + find . -mindepth 2 -name Cargo.toml | while read -r dir + do + echo "Checking package '$dir'" + cargo msrv verify --manifest-path "$dir" --output-format=json || exit 1 + done diff --git a/Cargo.toml b/Cargo.toml index 75ba410f12a6..39e3c0bca99a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,7 +74,7 @@ include = [ "Cargo.toml", ] edition = "2021" -rust-version = "1.62" +rust-version = "1.70" [workspace.dependencies] arrow = { version = "54.0.0", path = "./arrow", default-features = false } diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 6358227a8912..e52efbf67e21 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -20,7 +20,7 @@ name = "gen" description = "Code generation for arrow-flight" version = "0.1.0" edition = { workspace = true } -rust-version = { workspace = true } +rust-version = "1.71.1" authors = { workspace = true } homepage = { workspace = true } repository = { workspace = true } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 8654b4b92734..26cb05fae1c2 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -25,7 +25,7 @@ authors = { workspace = true } license = { workspace = true } edition = { workspace = true } publish = false -rust-version = { workspace = true } +rust-version = "1.75.0" [lib] crate-type = ["lib", "cdylib"] diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 03d08df30959..4ead95fcb912 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -25,7 +25,7 @@ authors = ["Apache Arrow "] license = "Apache-2.0" keywords = [ "arrow" ] edition = "2021" -rust-version = "1.62" +rust-version = "1.70" publish = false [lib] diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 1e1f9fbde0e4..d1bcf046b7ca 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -26,7 +26,7 @@ license = { workspace = true } keywords = { workspace = true } include = { workspace = true } edition = { workspace = true } -rust-version = { workspace = true } +rust-version = "1.64" [lib] name = "arrow_schema" diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 
8860cd61c5b3..a1c9c0ab2113 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -31,7 +31,7 @@ include = [ "Cargo.toml", ] edition = { workspace = true } -rust-version = "1.70.0" +rust-version = { workspace = true } [lib] name = "arrow" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 19f890710778..e4085472ea20 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -26,7 +26,7 @@ authors = { workspace = true } keywords = ["arrow", "parquet", "hadoop"] readme = "README.md" edition = { workspace = true } -rust-version = "1.70.0" +rust-version = { workspace = true } [target.'cfg(target_arch = "wasm32")'.dependencies] ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } From 70e105403922e837629f0a9edda43e02f789d32d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 8 Jan 2025 09:02:23 -0500 Subject: [PATCH 38/68] Document the `ParquetRecordBatchStream` buffering (#6947) * Document the ParquetRecordBatchStream buffering * Update parquet/src/arrow/async_reader/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- parquet/src/arrow/async_reader/mod.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 4f3befe42662..5323251b07e7 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -611,11 +611,23 @@ impl std::fmt::Debug for StreamState { } } -/// An asynchronous [`Stream`](https://docs.rs/futures/latest/futures/stream/trait.Stream.html) of [`RecordBatch`] -/// for a parquet file that can be constructed using [`ParquetRecordBatchStreamBuilder`]. +/// An asynchronous [`Stream`]of [`RecordBatch`] constructed using [`ParquetRecordBatchStreamBuilder`] to read parquet files. /// /// `ParquetRecordBatchStream` also provides [`ParquetRecordBatchStream::next_row_group`] for fetching row groups, /// allowing users to decode record batches separately from I/O. +/// +/// # I/O Buffering +/// +/// `ParquetRecordBatchStream` buffers *all* data pages selected after predicates +/// (projection + filtering, etc) and decodes the rows from those buffered pages. +/// +/// For example, if all rows and columns are selected, the entire row group is +/// buffered in memory during decode. 
This minimizes the number of IO operations +/// required, which is especially important for object stores, where IO operations +/// have latencies in the hundreds of milliseconds +/// +/// +/// [`Stream`]: https://docs.rs/futures/latest/futures/stream/trait.Stream.html pub struct ParquetRecordBatchStream { metadata: Arc, From 06b4b8f088bcc40f7d372dfaa69daab740cbb558 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 8 Jan 2025 06:06:13 -0800 Subject: [PATCH 39/68] Return `BoxStream` with `'static` lifetime from `ObjectStore::list` (#6619) Co-authored-by: Andrew Lamb --- object_store/src/aws/client.rs | 2 +- object_store/src/aws/mod.rs | 4 +-- object_store/src/azure/client.rs | 2 +- object_store/src/azure/mod.rs | 7 ++-- object_store/src/chunked.rs | 4 +-- object_store/src/client/list.rs | 19 +++++----- object_store/src/client/pagination.rs | 50 ++++++++++++++++----------- object_store/src/gcp/client.rs | 2 +- object_store/src/gcp/mod.rs | 4 +-- object_store/src/http/mod.rs | 15 ++++---- object_store/src/lib.rs | 8 ++--- object_store/src/limit.rs | 14 ++++---- object_store/src/local.rs | 2 +- object_store/src/memory.rs | 2 +- object_store/src/prefix.rs | 32 ++++++++++++++--- object_store/src/throttle.rs | 16 +++++---- object_store/tests/get_range_file.rs | 2 +- 17 files changed, 113 insertions(+), 72 deletions(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index b81be0c0efad..246f2779dd07 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -855,7 +855,7 @@ impl GetClient for S3Client { } #[async_trait] -impl ListClient for S3Client { +impl ListClient for Arc { /// Make an S3 List request async fn list_request( &self, diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index 7f449c49963c..82ef909de984 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -273,7 +273,7 @@ impl ObjectStore for AmazonS3 { .boxed() } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.client.list(prefix) } @@ -281,7 +281,7 @@ impl ObjectStore for AmazonS3 { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { if self.client.config.is_s3_express() { let offset = offset.clone(); // S3 Express does not support start-after diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index bd72d0c6aee1..fa5412c455fc 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -925,7 +925,7 @@ impl GetClient for AzureClient { } #[async_trait] -impl ListClient for AzureClient { +impl ListClient for Arc { /// Make an Azure List request async fn list_request( &self, diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 81b6667bc058..ea4dd8f567a9 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -119,6 +119,9 @@ impl ObjectStore for MicrosoftAzure { self.client.delete_request(location, &()).await } + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { + self.client.list(prefix) + } fn delete_stream<'a>( &'a self, locations: BoxStream<'a, Result>, @@ -139,10 +142,6 @@ impl ObjectStore for MicrosoftAzure { .boxed() } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { - self.client.list(prefix) - } - async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { self.client.list_with_delimiter(prefix).await } diff --git 
a/object_store/src/chunked.rs b/object_store/src/chunked.rs index 3f83c1336dc4..4998e9f2a04d 100644 --- a/object_store/src/chunked.rs +++ b/object_store/src/chunked.rs @@ -150,7 +150,7 @@ impl ObjectStore for ChunkedStore { self.inner.delete(location).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.inner.list(prefix) } @@ -158,7 +158,7 @@ impl ObjectStore for ChunkedStore { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { self.inner.list_with_offset(prefix, offset) } diff --git a/object_store/src/client/list.rs b/object_store/src/client/list.rs index 4445d0d17533..fe9bfebf768d 100644 --- a/object_store/src/client/list.rs +++ b/object_store/src/client/list.rs @@ -44,37 +44,38 @@ pub(crate) trait ListClientExt { prefix: Option<&Path>, delimiter: bool, offset: Option<&Path>, - ) -> BoxStream<'_, Result>; + ) -> BoxStream<'static, Result>; - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result>; + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result>; #[allow(unused)] fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result>; + ) -> BoxStream<'static, Result>; async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result; } #[async_trait] -impl ListClientExt for T { +impl ListClientExt for T { fn list_paginated( &self, prefix: Option<&Path>, delimiter: bool, offset: Option<&Path>, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let offset = offset.map(|x| x.to_string()); let prefix = prefix .filter(|x| !x.as_ref().is_empty()) .map(|p| format!("{}{}", p.as_ref(), crate::path::DELIMITER)); stream_paginated( + self.clone(), (prefix, offset), - move |(prefix, offset), token| async move { - let (r, next_token) = self + move |client, (prefix, offset), token| async move { + let (r, next_token) = client .list_request( prefix.as_deref(), delimiter, @@ -88,7 +89,7 @@ impl ListClientExt for T { .boxed() } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.list_paginated(prefix, false, None) .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() @@ -99,7 +100,7 @@ impl ListClientExt for T { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { self.list_paginated(prefix, false, Some(offset)) .map_ok(|r| futures::stream::iter(r.objects.into_iter().map(Ok))) .try_flatten() diff --git a/object_store/src/client/pagination.rs b/object_store/src/client/pagination.rs index 77b2a3d8e2f2..d789c7431d8c 100644 --- a/object_store/src/client/pagination.rs +++ b/object_store/src/client/pagination.rs @@ -35,9 +35,14 @@ use std::future::Future; /// finish, otherwise it will continue to call `op(state, token)` with the values returned by the /// previous call to `op`, until a continuation token of `None` is returned /// -pub(crate) fn stream_paginated(state: S, op: F) -> impl Stream> +pub(crate) fn stream_paginated( + client: C, + state: S, + op: F, +) -> impl Stream> where - F: Fn(S, Option) -> Fut + Copy, + C: Clone, + F: Fn(C, S, Option) -> Fut + Copy, Fut: Future)>>, { enum PaginationState { @@ -46,27 +51,30 @@ where Done, } - futures::stream::unfold(PaginationState::Start(state), move |state| async move { - let (s, page_token) = match state { - PaginationState::Start(s) => (s, None), - 
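+                // (Editorial note, not in the original patch:) the previous
+                // call to `op` returned a non-empty continuation token, so the
+                // next request resumes from that token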
PaginationState::HasMore(s, page_token) if !page_token.is_empty() => { - (s, Some(page_token)) - } - _ => { - return None; - } - }; + futures::stream::unfold(PaginationState::Start(state), move |state| { + let client = client.clone(); + async move { + let (s, page_token) = match state { + PaginationState::Start(s) => (s, None), + PaginationState::HasMore(s, page_token) if !page_token.is_empty() => { + (s, Some(page_token)) + } + _ => { + return None; + } + }; - let (resp, s, continuation) = match op(s, page_token).await { - Ok(resp) => resp, - Err(e) => return Some((Err(e), PaginationState::Done)), - }; + let (resp, s, continuation) = match op(client, s, page_token).await { + Ok(resp) => resp, + Err(e) => return Some((Err(e), PaginationState::Done)), + }; - let next_state = match continuation { - Some(token) => PaginationState::HasMore(s, token), - None => PaginationState::Done, - }; + let next_state = match continuation { + Some(token) => PaginationState::HasMore(s, token), + None => PaginationState::Done, + }; - Some((Ok(resp), next_state)) + Some((Ok(resp), next_state)) + } }) } diff --git a/object_store/src/gcp/client.rs b/object_store/src/gcp/client.rs index d6f89ca71740..8dd1c69802a8 100644 --- a/object_store/src/gcp/client.rs +++ b/object_store/src/gcp/client.rs @@ -633,7 +633,7 @@ impl GetClient for GoogleCloudStorageClient { } #[async_trait] -impl ListClient for GoogleCloudStorageClient { +impl ListClient for Arc { /// Perform a list request async fn list_request( &self, diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 5199135ba6b0..a2f512415a8d 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -183,7 +183,7 @@ impl ObjectStore for GoogleCloudStorage { self.client.delete_request(location).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.client.list(prefix) } @@ -191,7 +191,7 @@ impl ObjectStore for GoogleCloudStorage { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { self.client.list_with_offset(prefix, offset) } diff --git a/object_store/src/http/mod.rs b/object_store/src/http/mod.rs index 417f72856722..899740d36db9 100644 --- a/object_store/src/http/mod.rs +++ b/object_store/src/http/mod.rs @@ -31,6 +31,8 @@ //! [rfc2518]: https://datatracker.ietf.org/doc/html/rfc2518 //! 
[WebDAV]: https://en.wikipedia.org/wiki/WebDAV +use std::sync::Arc; + use async_trait::async_trait; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; @@ -79,7 +81,7 @@ impl From for crate::Error { /// See [`crate::http`] for more information #[derive(Debug)] pub struct HttpStore { - client: Client, + client: Arc, } impl std::fmt::Display for HttpStore { @@ -130,19 +132,20 @@ impl ObjectStore for HttpStore { self.client.delete(location).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let prefix_len = prefix.map(|p| p.as_ref().len()).unwrap_or_default(); let prefix = prefix.cloned(); + let client = Arc::clone(&self.client); futures::stream::once(async move { - let status = self.client.list(prefix.as_ref(), "infinity").await?; + let status = client.list(prefix.as_ref(), "infinity").await?; let iter = status .response .into_iter() .filter(|r| !r.is_dir()) - .map(|response| { + .map(move |response| { response.check_ok()?; - response.object_meta(self.client.base_url()) + response.object_meta(client.base_url()) }) // Filter out exact prefix matches .filter_ok(move |r| r.location.as_ref().len() > prefix_len); @@ -238,7 +241,7 @@ impl HttpBuilder { let parsed = Url::parse(&url).map_err(|source| Error::UnableToParseUrl { url, source })?; Ok(HttpStore { - client: Client::new(parsed, self.client_options, self.retry_config)?, + client: Arc::new(Client::new(parsed, self.client_options, self.retry_config)?), }) } } diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 987ffacc6e49..53eda5a82fd5 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -722,7 +722,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { /// `foo/bar_baz/x`. List is recursive, i.e. `foo/bar/more/x` will be included. /// /// Note: the order of returned [`ObjectMeta`] is not guaranteed - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result>; + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result>; /// List all the objects with the given prefix and a location greater than `offset` /// @@ -734,7 +734,7 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let offset = offset.clone(); self.list(prefix) .try_filter(move |f| futures::future::ready(f.location > offset)) @@ -847,7 +847,7 @@ macro_rules! as_ref_impl { self.as_ref().delete_stream(locations) } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { self.as_ref().list(prefix) } @@ -855,7 +855,7 @@ macro_rules! 
as_ref_impl { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { self.as_ref().list_with_offset(prefix, offset) } diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs index 6a3c3b574e62..77f72a0e11a1 100644 --- a/object_store/src/limit.rs +++ b/object_store/src/limit.rs @@ -45,7 +45,7 @@ use tokio::sync::{OwnedSemaphorePermit, Semaphore}; /// #[derive(Debug)] pub struct LimitStore { - inner: T, + inner: Arc, max_requests: usize, semaphore: Arc, } @@ -56,7 +56,7 @@ impl LimitStore { /// `max_requests` pub fn new(inner: T, max_requests: usize) -> Self { Self { - inner, + inner: Arc::new(inner), max_requests, semaphore: Arc::new(Semaphore::new(max_requests)), } @@ -144,12 +144,13 @@ impl ObjectStore for LimitStore { self.inner.delete_stream(locations) } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let prefix = prefix.cloned(); + let inner = Arc::clone(&self.inner); let fut = Arc::clone(&self.semaphore) .acquire_owned() .map(move |permit| { - let s = self.inner.list(prefix.as_ref()); + let s = inner.list(prefix.as_ref()); PermitWrapper::new(s, permit.unwrap()) }); fut.into_stream().flatten().boxed() @@ -159,13 +160,14 @@ impl ObjectStore for LimitStore { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let prefix = prefix.cloned(); let offset = offset.clone(); + let inner = Arc::clone(&self.inner); let fut = Arc::clone(&self.semaphore) .acquire_owned() .map(move |permit| { - let s = self.inner.list_with_offset(prefix.as_ref(), &offset); + let s = inner.list_with_offset(prefix.as_ref(), &offset); PermitWrapper::new(s, permit.unwrap()) }); fut.into_stream().flatten().boxed() diff --git a/object_store/src/local.rs b/object_store/src/local.rs index b193481ae7b8..364026459a03 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -488,7 +488,7 @@ impl ObjectStore for LocalFileSystem { .await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let config = Arc::clone(&self.config); let root_path = match prefix { diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 3f3cff3390db..6402f924346f 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -297,7 +297,7 @@ impl ObjectStore for InMemory { Ok(()) } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let root = Path::default(); let prefix = prefix.unwrap_or(&root); diff --git a/object_store/src/prefix.rs b/object_store/src/prefix.rs index 227887d78fd7..a0b67ca4b58e 100644 --- a/object_store/src/prefix.rs +++ b/object_store/src/prefix.rs @@ -74,6 +74,28 @@ impl PrefixStore { } } +// Note: This is a relative hack to move these two functions to pure functions so they don't rely +// on the `self` lifetime. Expected to be cleaned up before merge. 
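+// (Why this helps: as methods on `&self`, these helpers tied each returned
+// stream to the borrow of `self`; as free functions over a cloned `Path`,
+// the streams returned below can meet the new `'static` bound.)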
+// +/// Strip the constant prefix from a given path +fn strip_prefix(prefix: &Path, path: Path) -> Path { + // Note cannot use match because of borrow checker + if let Some(suffix) = path.prefix_match(prefix) { + return suffix.collect(); + } + path +} + +/// Strip the constant prefix from a given ObjectMeta +fn strip_meta(prefix: &Path, meta: ObjectMeta) -> ObjectMeta { + ObjectMeta { + last_modified: meta.last_modified, + size: meta.size, + location: strip_prefix(prefix, meta.location), + e_tag: meta.e_tag, + version: None, + } +} #[async_trait::async_trait] impl ObjectStore for PrefixStore { async fn put(&self, location: &Path, payload: PutPayload) -> Result { @@ -136,21 +158,23 @@ impl ObjectStore for PrefixStore { self.inner.delete(&full_path).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let prefix = self.full_path(prefix.unwrap_or(&Path::default())); let s = self.inner.list(Some(&prefix)); - s.map_ok(|meta| self.strip_meta(meta)).boxed() + let slf_prefix = self.prefix.clone(); + s.map_ok(move |meta| strip_meta(&slf_prefix, meta)).boxed() } fn list_with_offset( &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let offset = self.full_path(offset); let prefix = self.full_path(prefix.unwrap_or(&Path::default())); let s = self.inner.list_with_offset(Some(&prefix), &offset); - s.map_ok(|meta| self.strip_meta(meta)).boxed() + let slf_prefix = self.prefix.clone(); + s.map_ok(move |meta| strip_meta(&slf_prefix, meta)).boxed() } async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index b9dff5c6d1d2..29cd32705ccc 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -237,11 +237,13 @@ impl ObjectStore for ThrottledStore { self.inner.delete(location).await } - fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { let stream = self.inner.list(prefix); + let config = Arc::clone(&self.config); futures::stream::once(async move { - let wait_list_per_entry = self.config().wait_list_per_entry; - sleep(self.config().wait_list_per_call).await; + let config = *config.lock(); + let wait_list_per_entry = config.wait_list_per_entry; + sleep(config.wait_list_per_call).await; throttle_stream(stream, move |_| wait_list_per_entry) }) .flatten() @@ -252,11 +254,13 @@ impl ObjectStore for ThrottledStore { &self, prefix: Option<&Path>, offset: &Path, - ) -> BoxStream<'_, Result> { + ) -> BoxStream<'static, Result> { let stream = self.inner.list_with_offset(prefix, offset); + let config = Arc::clone(&self.config); futures::stream::once(async move { - let wait_list_per_entry = self.config().wait_list_per_entry; - sleep(self.config().wait_list_per_call).await; + let config = *config.lock(); + let wait_list_per_entry = config.wait_list_per_entry; + sleep(config.wait_list_per_call).await; throttle_stream(stream, move |_| wait_list_per_entry) }) .flatten() diff --git a/object_store/tests/get_range_file.rs b/object_store/tests/get_range_file.rs index c5550ac21728..e500fc8ac87d 100644 --- a/object_store/tests/get_range_file.rs +++ b/object_store/tests/get_range_file.rs @@ -62,7 +62,7 @@ impl ObjectStore for MyStore { todo!() } - fn list(&self, _: Option<&Path>) -> BoxStream<'_, Result> { + fn list(&self, _: Option<&Path>) -> BoxStream<'static, Result> { todo!() } 
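Every store in this patch applies the same trick: a method borrowing `&self` can only return `BoxStream<'_, _>`, so the client moves behind an `Arc` (`Arc<Client>` in `HttpStore`, `Arc<T>` in `LimitStore`) and a clone of that handle is moved into the returned stream. The pagination helper gets the same treatment, with the client threaded through `stream_paginated` explicitly so each page request owns its own copy. Below is a minimal sketch of the lifetime pattern only, using a hypothetical `MyStore` type and just the `futures` crate, not the actual `ObjectStore` trait:

    use std::sync::Arc;
    use futures::stream::{self, BoxStream, StreamExt};

    struct MyStore {
        // Shared state lives behind an Arc so a returned stream can own a handle.
        entries: Arc<Vec<String>>,
    }

    impl MyStore {
        // Cloning the Arc into the stream removes every borrow of `self`,
        // which is what permits the `'static` lifetime on the return type.
        fn list(&self) -> BoxStream<'static, String> {
            let entries = Arc::clone(&self.entries);
            stream::iter(0..entries.len())
                .map(move |i| entries[i].clone())
                .boxed()
        }
    }

The payoff is that a caller can keep or move the stream after the borrow of the store ends, for example spawning it onto another task, which a stream tied to `&self` would not allow.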
From a89585dc889eface963423cb6420a197131bb061 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Wed, 8 Jan 2025 09:28:05 -0500 Subject: [PATCH 40/68] [Parquet] Reuse buffer in `ByteViewArrayDecoderPlain` (#6930) * reuse buffer in view array * Update parquet/src/arrow/array_reader/byte_view_array.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * use From instead --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- .../src/arrow/array_reader/byte_view_array.rs | 38 ++++++++++++++++--- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs b/parquet/src/arrow/array_reader/byte_view_array.rs index 92a8b0592d0d..0e16642940d2 100644 --- a/parquet/src/arrow/array_reader/byte_view_array.rs +++ b/parquet/src/arrow/array_reader/byte_view_array.rs @@ -290,7 +290,7 @@ impl ByteViewArrayDecoder { /// Decoder from [`Encoding::PLAIN`] data to [`ViewBuffer`] pub struct ByteViewArrayDecoderPlain { - buf: Bytes, + buf: Buffer, offset: usize, validate_utf8: bool, @@ -308,7 +308,7 @@ impl ByteViewArrayDecoderPlain { validate_utf8: bool, ) -> Self { Self { - buf, + buf: Buffer::from(buf), offset: 0, max_remaining_values: num_values.unwrap_or(num_levels), validate_utf8, @@ -316,9 +316,15 @@ impl ByteViewArrayDecoderPlain { } pub fn read(&mut self, output: &mut ViewBuffer, len: usize) -> Result { - // Zero copy convert `bytes::Bytes` into `arrow_buffer::Buffer` - let buf = arrow_buffer::Buffer::from(self.buf.clone()); - let block_id = output.append_block(buf); + // avoid creating a new buffer if the last buffer is the same as the current buffer + // This is especially useful when row-level filtering is applied, where we call lots of small `read` over the same buffer. 
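+ // (`Buffer::ptr_eq` compares the underlying allocation rather than the byte
+ // contents, so a clone of `self.buf` appended by an earlier call is
+ // recognized below and its block index is reused instead of appending a
+ // duplicate block.)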
+ let block_id = { + if output.buffers.last().is_some_and(|x| x.ptr_eq(&self.buf)) { + output.buffers.len() as u32 - 1 + } else { + output.append_block(self.buf.clone()) + } + }; let to_read = len.min(self.max_remaining_values); @@ -690,12 +696,13 @@ mod tests { use crate::{ arrow::{ - array_reader::test_util::{byte_array_all_encodings, utf8_column}, + array_reader::test_util::{byte_array_all_encodings, encode_byte_array, utf8_column}, buffer::view_buffer::ViewBuffer, record_reader::buffer::ValuesBuffer, }, basic::Encoding, column::reader::decoder::ColumnValueDecoder, + data_type::ByteArray, }; use super::*; @@ -746,4 +753,23 @@ mod tests { ); } } + + #[test] + fn test_byte_view_array_plain_decoder_reuse_buffer() { + let byte_array = vec!["hello", "world", "large payload over 12 bytes", "b"]; + let byte_array: Vec = byte_array.into_iter().map(|x| x.into()).collect(); + let pages = encode_byte_array(Encoding::PLAIN, &byte_array); + + let column_desc = utf8_column(); + let mut decoder = ByteViewArrayColumnValueDecoder::new(&column_desc); + + let mut view_buffer = ViewBuffer::default(); + decoder.set_data(Encoding::PLAIN, pages, 4, None).unwrap(); + decoder.read(&mut view_buffer, 1).unwrap(); + decoder.read(&mut view_buffer, 1).unwrap(); + assert_eq!(view_buffer.buffers.len(), 1); + + decoder.read(&mut view_buffer, 1).unwrap(); + assert_eq!(view_buffer.buffers.len(), 1); + } } From 6761baba64d3a7775af6feddda5e2799790df76c Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 8 Jan 2025 09:54:39 -0800 Subject: [PATCH 41/68] regenerate arrow-ipc/src/gen with patched flatbuffers (#6426) * regenerate arrow-ipc/src/gen with patched flatbuffers * use git repo instead of local path * add backticks * expand allowed overage to accommodate more alignment padding * re-enable nanoarrow integration test * add assertions that struct alignment is correct * remove struct alignment assertions * apply a patch to generated code rather than requiring patched flatc * point to google/flatbuffers with pub PushAlignment * add license header to gen.patch * use flatbuffers 24.12.23 * remove unnecessary gen.patch --- .github/workflows/integration.yml | 3 +- arrow-flight/src/encode.rs | 14 +- arrow-ipc/Cargo.toml | 2 +- arrow-ipc/regen.sh | 90 +++---- arrow-ipc/src/gen/File.rs | 26 +- arrow-ipc/src/gen/Message.rs | 66 ++--- arrow-ipc/src/gen/Schema.rs | 397 +++++++++++++++--------------- arrow-ipc/src/gen/SparseTensor.rs | 182 +++++++++++--- arrow-ipc/src/gen/Tensor.rs | 150 +++++++++-- arrow-ipc/src/lib.rs | 11 + 10 files changed, 609 insertions(+), 332 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 9b23b1b5ad2e..a47195d1becf 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -65,8 +65,7 @@ jobs: ARROW_INTEGRATION_JAVA: ON ARROW_INTEGRATION_JS: ON ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS: "rust" - # Disable nanoarrow integration, due to https://github.com/apache/arrow-rs/issues/5052 - ARCHERY_INTEGRATION_WITH_NANOARROW: "0" + ARCHERY_INTEGRATION_WITH_NANOARROW: "1" # https://github.com/apache/arrow/pull/38403/files#r1371281630 ARCHERY_INTEGRATION_WITH_RUST: "1" # These are necessary because the github runner overrides $HOME diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 19fe42474405..57ac9f3173fe 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -1708,7 +1708,7 @@ mod tests { ]) .unwrap(); - verify_encoded_split(batch, 112).await; + verify_encoded_split(batch, 
120).await; } #[tokio::test] @@ -1719,7 +1719,7 @@ mod tests { // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 4304).await; + verify_encoded_split(batch, 4312).await; } #[tokio::test] @@ -1755,7 +1755,7 @@ mod tests { // 5k over limit (which is 2x larger than limit of 5k) // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 5800).await; + verify_encoded_split(batch, 5808).await; } #[tokio::test] @@ -1771,7 +1771,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); - verify_encoded_split(batch, 48).await; + verify_encoded_split(batch, 56).await; } #[tokio::test] @@ -1785,7 +1785,7 @@ mod tests { // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 3328).await; + verify_encoded_split(batch, 3336).await; } #[tokio::test] @@ -1799,7 +1799,7 @@ mod tests { // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 5280).await; + verify_encoded_split(batch, 5288).await; } #[tokio::test] @@ -1824,7 +1824,7 @@ mod tests { // overage is much higher than ideal // https://github.com/apache/arrow-rs/issues/3478 - verify_encoded_split(batch, 4128).await; + verify_encoded_split(batch, 4136).await; } /// Return size, in memory of flight data diff --git a/arrow-ipc/Cargo.toml b/arrow-ipc/Cargo.toml index cf91b3a3415f..4988eed4a5ed 100644 --- a/arrow-ipc/Cargo.toml +++ b/arrow-ipc/Cargo.toml @@ -38,7 +38,7 @@ arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } -flatbuffers = { version = "24.3.25", default-features = false } +flatbuffers = { version = "24.12.23", default-features = false } lz4_flex = { version = "0.11", default-features = false, features = ["std", "frame"], optional = true } zstd = { version = "0.13.0", default-features = false, optional = true } diff --git a/arrow-ipc/regen.sh b/arrow-ipc/regen.sh index 8d8862ccc7f4..b368bd1bc7cc 100755 --- a/arrow-ipc/regen.sh +++ b/arrow-ipc/regen.sh @@ -21,33 +21,36 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" # Change to the toplevel `arrow-rs` directory pushd $DIR/../ -echo "Build flatc from source ..." - -FB_URL="https://github.com/google/flatbuffers" -FB_DIR="arrow/.flatbuffers" -FLATC="$FB_DIR/bazel-bin/flatc" - -if [ -z $(which bazel) ]; then - echo "bazel is required to build flatc" - exit 1 -fi - -echo "Bazel version: $(bazel version | head -1 | awk -F':' '{print $2}')" - -if [ ! -e $FB_DIR ]; then - echo "git clone $FB_URL ..." - git clone -b master --no-tag --depth 1 $FB_URL $FB_DIR +if [ -z "$FLATC" ]; then + echo "Build flatc from source ..." + + FB_URL="https://github.com/google/flatbuffers" + FB_DIR="arrow/.flatbuffers" + FLATC="$FB_DIR/bazel-bin/flatc" + + if [ -z $(which bazel) ]; then + echo "bazel is required to build flatc" + exit 1 + fi + + echo "Bazel version: $(bazel version | head -1 | awk -F':' '{print $2}')" + + if [ ! -e $FB_DIR ]; then + echo "git clone $FB_URL ..." + git clone -b master --no-tag --depth 1 $FB_URL $FB_DIR + else + echo "git pull $FB_URL ..." + git -C $FB_DIR pull + fi + + pushd $FB_DIR + echo "run: bazel build :flatc ..." + bazel build :flatc + popd else - echo "git pull $FB_URL ..." - git -C $FB_DIR pull + echo "Using flatc $FLATC ..." fi -pushd $FB_DIR -echo "run: bazel build :flatc ..." 
-bazel build :flatc -popd - - # Execute the code generation: $FLATC --filename-suffix "" --rust -o arrow-ipc/src/gen/ format/*.fbs @@ -99,37 +102,38 @@ for f in `ls *.rs`; do fi echo "Modifying: $f" - sed -i '' '/extern crate flatbuffers;/d' $f - sed -i '' '/use self::flatbuffers::EndianScalar;/d' $f - sed -i '' '/\#\[allow(unused_imports, dead_code)\]/d' $f - sed -i '' '/pub mod org {/d' $f - sed -i '' '/pub mod apache {/d' $f - sed -i '' '/pub mod arrow {/d' $f - sed -i '' '/pub mod flatbuf {/d' $f - sed -i '' '/} \/\/ pub mod flatbuf/d' $f - sed -i '' '/} \/\/ pub mod arrow/d' $f - sed -i '' '/} \/\/ pub mod apache/d' $f - sed -i '' '/} \/\/ pub mod org/d' $f - sed -i '' '/use core::mem;/d' $f - sed -i '' '/use core::cmp::Ordering;/d' $f - sed -i '' '/use self::flatbuffers::{EndianScalar, Follow};/d' $f + sed --in-place='' '/extern crate flatbuffers;/d' $f + sed --in-place='' '/use self::flatbuffers::EndianScalar;/d' $f + sed --in-place='' '/\#\[allow(unused_imports, dead_code)\]/d' $f + sed --in-place='' '/pub mod org {/d' $f + sed --in-place='' '/pub mod apache {/d' $f + sed --in-place='' '/pub mod arrow {/d' $f + sed --in-place='' '/pub mod flatbuf {/d' $f + sed --in-place='' '/} \/\/ pub mod flatbuf/d' $f + sed --in-place='' '/} \/\/ pub mod arrow/d' $f + sed --in-place='' '/} \/\/ pub mod apache/d' $f + sed --in-place='' '/} \/\/ pub mod org/d' $f + sed --in-place='' '/use core::mem;/d' $f + sed --in-place='' '/use core::cmp::Ordering;/d' $f + sed --in-place='' '/use self::flatbuffers::{EndianScalar, Follow};/d' $f # required by flatc 1.12.0+ - sed -i '' "/\#\!\[allow(unused_imports, dead_code)\]/d" $f + sed --in-place='' "/\#\!\[allow(unused_imports, dead_code)\]/d" $f for name in ${names[@]}; do - sed -i '' "/use crate::${name}::\*;/d" $f - sed -i '' "s/use self::flatbuffers::Verifiable;/use flatbuffers::Verifiable;/g" $f + sed --in-place='' "/use crate::${name}::\*;/d" $f + sed --in-place='' "s/use self::flatbuffers::Verifiable;/use flatbuffers::Verifiable;/g" $f done # Replace all occurrences of "type__" with "type_", "TYPE__" with "TYPE_". 
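+ # NOTE: `--in-place=''` is GNU sed's long-option form of in-place editing
+ # with no backup suffix; the old `-i ''` spelling is the BSD/macOS syntax,
+ # so these substitutions now assume GNU sed.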
- sed -i '' 's/type__/type_/g' $f - sed -i '' 's/TYPE__/TYPE_/g' $f + sed --in-place='' 's/type__/type_/g' $f + sed --in-place='' 's/TYPE__/TYPE_/g' $f # Some files need prefixes if [[ $f == "File.rs" ]]; then # Now prefix the file with the static contents echo -e "${PREFIX}" "${SCHEMA_IMPORT}" | cat - $f > temp && mv temp $f elif [[ $f == "Message.rs" ]]; then + sed --in-place='' 's/List/\`List\`/g' $f echo -e "${PREFIX}" "${SCHEMA_IMPORT}" "${SPARSE_TENSOR_IMPORT}" "${TENSOR_IMPORT}" | cat - $f > temp && mv temp $f elif [[ $f == "SparseTensor.rs" ]]; then echo -e "${PREFIX}" "${SCHEMA_IMPORT}" "${TENSOR_IMPORT}" | cat - $f > temp && mv temp $f diff --git a/arrow-ipc/src/gen/File.rs b/arrow-ipc/src/gen/File.rs index c0c2fb183237..427cf75de096 100644 --- a/arrow-ipc/src/gen/File.rs +++ b/arrow-ipc/src/gen/File.rs @@ -23,6 +23,8 @@ use flatbuffers::EndianScalar; use std::{cmp::Ordering, mem}; // automatically generated by the FlatBuffers compiler, do not modify +// @generated + // struct Block, aligned to 8 #[repr(transparent)] #[derive(Clone, Copy, PartialEq)] @@ -64,6 +66,10 @@ impl<'b> flatbuffers::Push for Block { let src = ::core::slice::from_raw_parts(self as *const Block as *const u8, Self::size()); dst.copy_from_slice(src); } + #[inline] + fn alignment() -> flatbuffers::PushAlignment { + flatbuffers::PushAlignment::new(8) + } } impl<'a> flatbuffers::Verifiable for Block { @@ -211,8 +217,8 @@ impl<'a> Footer<'a> { Footer { _tab: table } } #[allow(unused_mut)] - pub fn create<'bldr: 'args, 'args: 'mut_bldr, 'mut_bldr>( - _fbb: &'mut_bldr mut flatbuffers::FlatBufferBuilder<'bldr>, + pub fn create<'bldr: 'args, 'args: 'mut_bldr, 'mut_bldr, A: flatbuffers::Allocator + 'bldr>( + _fbb: &'mut_bldr mut flatbuffers::FlatBufferBuilder<'bldr, A>, args: &'args FooterArgs<'args>, ) -> flatbuffers::WIPOffset> { let mut builder = FooterBuilder::new(_fbb); @@ -344,11 +350,11 @@ impl<'a> Default for FooterArgs<'a> { } } -pub struct FooterBuilder<'a: 'b, 'b> { - fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>, +pub struct FooterBuilder<'a: 'b, 'b, A: flatbuffers::Allocator + 'a> { + fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a, A>, start_: flatbuffers::WIPOffset, } -impl<'a: 'b, 'b> FooterBuilder<'a, 'b> { +impl<'a: 'b, 'b, A: flatbuffers::Allocator + 'a> FooterBuilder<'a, 'b, A> { #[inline] pub fn add_version(&mut self, version: MetadataVersion) { self.fbb_ @@ -388,7 +394,7 @@ impl<'a: 'b, 'b> FooterBuilder<'a, 'b> { ); } #[inline] - pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>) -> FooterBuilder<'a, 'b> { + pub fn new(_fbb: &'b mut flatbuffers::FlatBufferBuilder<'a, A>) -> FooterBuilder<'a, 'b, A> { let start = _fbb.start_table(); FooterBuilder { fbb_: _fbb, @@ -474,16 +480,16 @@ pub unsafe fn size_prefixed_root_as_footer_unchecked(buf: &[u8]) -> Footer { flatbuffers::size_prefixed_root_unchecked::