From 0663077c6c86402062bef3c90dd6bf7e6f5d1390 Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Wed, 27 Aug 2025 23:12:44 +0800 Subject: [PATCH 1/7] add strict mode to `cast_to_variant` and related functions Signed-off-by: codephage2020 --- .../src/cast_to_variant.rs | 250 ++++++++++++++---- parquet-variant-compute/src/lib.rs | 1 + .../src/type_conversion.rs | 35 +++ 3 files changed, 237 insertions(+), 49 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index abc9a863e1ea..95f3a1704aa6 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -51,7 +51,8 @@ fn convert_timestamp( time_zone: &Option>, input: &dyn Array, builder: &mut VariantArrayBuilder, -) { + strict: bool, +) -> Result<(), ArrowError> { let native_datetimes: Vec> = match time_unit { arrow_schema::TimeUnit::Second => { let ts_array = input @@ -59,10 +60,28 @@ fn convert_timestamp( .downcast_ref::() .expect("Array is not TimestampSecondArray"); - ts_array - .iter() - .map(|x| x.map(|y| timestamp_s_to_datetime(y).unwrap())) - .collect() + if strict { + let mut result = Vec::with_capacity(ts_array.len()); + for x in ts_array.iter() { + match x { + Some(y) => match timestamp_s_to_datetime(y) { + Some(dt) => result.push(Some(dt)), + None => { + return Err(ArrowError::ComputeError( + "Invalid timestamp seconds value".to_string(), + )) + } + }, + None => result.push(None), + } + } + result + } else { + ts_array + .iter() + .map(|x| x.and_then(timestamp_s_to_datetime)) + .collect() + } } arrow_schema::TimeUnit::Millisecond => { let ts_array = input @@ -70,48 +89,109 @@ fn convert_timestamp( .downcast_ref::() .expect("Array is not TimestampMillisecondArray"); - ts_array - .iter() - .map(|x| x.map(|y| timestamp_ms_to_datetime(y).unwrap())) - .collect() + if strict { + let mut result = Vec::with_capacity(ts_array.len()); + for x in ts_array.iter() { + match x { + Some(y) => match timestamp_ms_to_datetime(y) { + Some(dt) => result.push(Some(dt)), + None => { + return Err(ArrowError::ComputeError( + "Invalid timestamp milliseconds value".to_string(), + )) + } + }, + None => result.push(None), + } + } + result + } else { + ts_array + .iter() + .map(|x| x.and_then(timestamp_ms_to_datetime)) + .collect() + } } arrow_schema::TimeUnit::Microsecond => { let ts_array = input .as_any() .downcast_ref::() .expect("Array is not TimestampMicrosecondArray"); - ts_array - .iter() - .map(|x| x.map(|y| timestamp_us_to_datetime(y).unwrap())) - .collect() + if strict { + let mut result = Vec::with_capacity(ts_array.len()); + for x in ts_array.iter() { + match x { + Some(y) => match timestamp_us_to_datetime(y) { + Some(dt) => result.push(Some(dt)), + None => { + return Err(ArrowError::ComputeError( + "Invalid timestamp microseconds value".to_string(), + )) + } + }, + None => result.push(None), + } + } + result + } else { + ts_array + .iter() + .map(|x| x.and_then(timestamp_us_to_datetime)) + .collect() + } } arrow_schema::TimeUnit::Nanosecond => { let ts_array = input .as_any() .downcast_ref::() .expect("Array is not TimestampNanosecondArray"); - ts_array - .iter() - .map(|x| x.map(|y| timestamp_ns_to_datetime(y).unwrap())) - .collect() + if strict { + let mut result = Vec::with_capacity(ts_array.len()); + for x in ts_array.iter() { + match x { + Some(y) => match timestamp_ns_to_datetime(y) { + Some(dt) => result.push(Some(dt)), + None => { + return Err(ArrowError::ComputeError( + "Invalid timestamp nanoseconds value".to_string(), + )) + } + }, + None => result.push(None), + } + } + result + } else { + ts_array + .iter() + .map(|x| x.and_then(timestamp_ns_to_datetime)) + .collect() + } } }; - for x in native_datetimes { + for (i, x) in native_datetimes.iter().enumerate() { match x { Some(ndt) => { if time_zone.is_none() { - builder.append_variant(ndt.into()); + builder.append_variant((*ndt).into()); } else { - let utc_dt: DateTime = Utc.from_utc_datetime(&ndt); + let utc_dt: DateTime = Utc.from_utc_datetime(ndt); builder.append_variant(utc_dt.into()); } } None => { + if strict && input.is_valid(i) { + return Err(ArrowError::ComputeError(format!( + "Failed to convert timestamp at index {}: invalid timestamp value", + i + ))); + } builder.append_null(); } } } + Ok(()) } /// Casts a typed arrow [`Array`] to a [`VariantArray`]. This is useful when you @@ -143,7 +223,14 @@ fn convert_timestamp( /// `1970-01-01T00:00:01.234567890Z` /// will be truncated to /// `1970-01-01T00:00:01.234567Z` -pub fn cast_to_variant(input: &dyn Array) -> Result { +/// +/// # Arguments +/// * `input` - The array to convert to VariantArray +/// * `strict` - If true, return error on conversion failure. If false, insert null for failed conversions. +pub fn cast_to_variant_with_options( + input: &dyn Array, + strict: bool, +) -> Result { let mut builder = VariantArrayBuilder::new(input.len()); let input_type = input.data_type(); @@ -248,7 +335,7 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { } } DataType::Timestamp(time_unit, time_zone) => { - convert_timestamp(time_unit, time_zone, input, &mut builder); + convert_timestamp(time_unit, time_zone, input, &mut builder, strict)?; } DataType::Time32(unit) => { match *unit { @@ -257,10 +344,11 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { Time32SecondType, as_primitive, // nano second are always 0 - |v| NaiveTime::from_num_seconds_from_midnight_opt(v as u32, 0u32).unwrap(), + |v| NaiveTime::from_num_seconds_from_midnight_opt(v as u32, 0u32), input, - builder - ); + builder, + strict + )?; } TimeUnit::Millisecond => { generic_conversion_array!( @@ -269,11 +357,11 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { |v| NaiveTime::from_num_seconds_from_midnight_opt( v as u32 / 1000, (v as u32 % 1000) * 1_000_000 - ) - .unwrap(), + ), input, - builder - ); + builder, + strict + )?; } _ => { return Err(ArrowError::CastError(format!( @@ -292,11 +380,11 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { |v| NaiveTime::from_num_seconds_from_midnight_opt( (v / 1_000_000) as u32, (v % 1_000_000 * 1_000) as u32 - ) - .unwrap(), + ), input, - builder - ); + builder, + strict + )?; } TimeUnit::Nanosecond => { generic_conversion_array!( @@ -305,11 +393,11 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { |v| NaiveTime::from_num_seconds_from_midnight_opt( (v / 1_000_000_000) as u32, (v % 1_000_000_000) as u32 - ) - .unwrap(), + ), input, - builder - ); + builder, + strict + )?; } _ => { return Err(ArrowError::CastError(format!( @@ -396,10 +484,11 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { generic_conversion_array!( Date64Type, as_primitive, - |v: i64| { Date64Type::to_naive_date_opt(v).unwrap() }, + |v: i64| Date64Type::to_naive_date_opt(v), input, - builder - ); + builder, + strict + )?; } DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() { DataType::Int16 => convert_run_end_encoded::(input, &mut builder)?, @@ -545,6 +634,14 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { Ok(builder.build()) } +/// Convert an array to a `VariantArray` with strict mode enabled (panics on conversion failures). +/// +/// This function provides backward compatibility. For non-panicking behavior, +/// use `cast_to_variant_with_options` with `strict = false`. +pub fn cast_to_variant(input: &dyn Array) -> Result { + cast_to_variant_with_options(input, true) +} + /// Convert union arrays fn convert_union( fields: &UnionFields, @@ -645,10 +742,6 @@ fn convert_dictionary_encoded( Ok(()) } -// TODO do we need a cast_with_options to allow specifying conversion behavior, -// e.g. how to handle overflows, whether to convert to Variant::Null or return -// an error, etc. ? - #[cfg(test)] mod tests { use super::*; @@ -661,8 +754,8 @@ mod tests { IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeListArray, LargeStringArray, ListArray, MapArray, NullArray, StringArray, StringRunBuilder, StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, - Time64MicrosecondArray, Time64NanosecondArray, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, UnionArray, + Time64MicrosecondArray, Time64NanosecondArray, TimestampSecondArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, UnionArray, }; use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow::datatypes::{IntervalDayTime, IntervalMonthDayNano}; @@ -2376,9 +2469,8 @@ mod tests { /// Converts the given `Array` to a `VariantArray` and tests the conversion /// against the expected values. It also tests the handling of nulls by /// setting one element to null and verifying the output. - fn run_test(values: ArrayRef, expected: Vec>) { - // test without nulls - let variant_array = cast_to_variant(&values).unwrap(); + fn run_test_with_options(values: ArrayRef, expected: Vec>, strict: bool) { + let variant_array = cast_to_variant_with_options(&values, strict).unwrap(); assert_eq!(variant_array.len(), expected.len()); for (i, expected_value) in expected.iter().enumerate() { match expected_value { @@ -2392,4 +2484,64 @@ mod tests { } } } + + fn run_test(values: ArrayRef, expected: Vec>) { + run_test_with_options(values, expected, true); + } + + fn run_test_non_strict(values: ArrayRef, expected: Vec>) { + run_test_with_options(values, expected, false); + } + + #[test] + fn test_cast_to_variant_non_strict_mode_date64() { + let date64_values = Date64Array::from(vec![Some(i64::MAX), Some(0), Some(i64::MIN)]); + + let values = Arc::new(date64_values); + run_test_non_strict( + values, + vec![ + None, + Some(Variant::Date(Date64Type::to_naive_date_opt(0).unwrap())), + None, + ], + ); + } + + #[test] + fn test_cast_to_variant_non_strict_mode_time32() { + let time32_array = Time32SecondArray::from(vec![Some(90000), Some(3600), Some(-1)]); + + let values = Arc::new(time32_array); + run_test_non_strict( + values, + vec![ + None, + Some(Variant::Time( + NaiveTime::from_num_seconds_from_midnight_opt(3600, 0).unwrap(), + )), + None, + ], + ); + } + + #[test] + fn test_cast_to_variant_non_strict_mode_timestamp() { + let ts_array = TimestampSecondArray::from(vec![Some(i64::MAX), Some(0), Some(1609459200)]) + .with_timezone_opt(None::<&str>); + + let values = Arc::new(ts_array); + run_test_non_strict( + values, + vec![ + None, // Invalid timestamp becomes null + Some(Variant::TimestampNtzMicros( + timestamp_s_to_datetime(0).unwrap(), + )), + Some(Variant::TimestampNtzMicros( + timestamp_s_to_datetime(1609459200).unwrap(), + )), + ], + ); + } } diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index ef674d9614b5..82b50a011d69 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -46,5 +46,6 @@ pub mod variant_get; pub use variant_array::{ShreddingState, VariantArray}; pub use variant_array_builder::{VariantArrayBuilder, VariantArrayVariantBuilder}; +pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options}; pub use from_json::json_to_variant; pub use to_json::variant_to_json; diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 647d2c705ff0..b508d982c9a1 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -20,6 +20,7 @@ /// Convert the input array to a `VariantArray` row by row, using `method` /// not requiring a generic type to downcast the generic array to a specific /// array type and `cast_fn` to transform each element to a type compatible with Variant +/// If `strict` is true, return error on conversion failure. If false, insert null. macro_rules! non_generic_conversion_array { ($array:expr, $cast_fn:expr, $builder:expr) => {{ let array = $array; @@ -32,6 +33,31 @@ macro_rules! non_generic_conversion_array { $builder.append_variant(Variant::from(cast_value)); } }}; + ($array:expr, $cast_fn:expr, $builder:expr, $strict:expr) => {{ + let array = $array; + for i in 0..array.len() { + if array.is_null(i) { + $builder.append_null(); + continue; + } + match $cast_fn(array.value(i)) { + Some(cast_value) => { + $builder.append_variant(Variant::from(cast_value)); + } + None => { + if $strict { + return Err(ArrowError::ComputeError(format!( + "Failed to convert value at index {}: conversion failed", + i + ))); + } else { + $builder.append_null(); + } + } + } + } + Ok::<(), ArrowError>(()) + }}; } pub(crate) use non_generic_conversion_array; @@ -52,6 +78,7 @@ pub(crate) use non_generic_conversion_single_value; /// Convert the input array to a `VariantArray` row by row, using `method` /// requiring a generic type to downcast the generic array to a specific /// array type and `cast_fn` to transform each element to a type compatible with Variant +/// If `strict` is true, return error on conversion failure. If false, insert null. macro_rules! generic_conversion_array { ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ $crate::type_conversion::non_generic_conversion_array!( @@ -60,6 +87,14 @@ macro_rules! generic_conversion_array { $builder ) }}; + ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr, $strict:expr) => {{ + $crate::type_conversion::non_generic_conversion_array!( + $input.$method::<$t>(), + $cast_fn, + $builder, + $strict + ) + }}; } pub(crate) use generic_conversion_array; From 7ff0b9d0cfff0c65f94b112700b62138e612c06d Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Wed, 27 Aug 2025 23:40:07 +0800 Subject: [PATCH 2/7] make doc happy Signed-off-by: codephage2020 --- parquet-variant-compute/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 82b50a011d69..2d5e0cb5a67a 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -22,7 +22,7 @@ //! - [`VariantArrayBuilder`]: For building [`VariantArray`] //! - [`json_to_variant`]: Function to convert a batch of JSON strings to a `VariantArray`. //! - [`variant_to_json`]: Function to convert a `VariantArray` to a batch of JSON strings. -//! - [`cast_to_variant`]: Module to cast other Arrow arrays to `VariantArray`. +//! - [`mod@cast_to_variant`]: Module to cast other Arrow arrays to `VariantArray`. //! - [`variant_get`]: Module to get values from a `VariantArray` using a specified [`VariantPath`] //! //! ## 🚧 Work In Progress From e2c455fbdd7998aaf267da922e9176a6f7f6ffaa Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Thu, 28 Aug 2025 21:54:01 +0800 Subject: [PATCH 3/7] update comments Signed-off-by: codephage2020 --- parquet-variant-compute/src/type_conversion.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index b508d982c9a1..d6d89684226c 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -20,7 +20,7 @@ /// Convert the input array to a `VariantArray` row by row, using `method` /// not requiring a generic type to downcast the generic array to a specific /// array type and `cast_fn` to transform each element to a type compatible with Variant -/// If `strict` is true, return error on conversion failure. If false, insert null. +/// If `strict` is true(default), return error on conversion failure. If false, insert null. macro_rules! non_generic_conversion_array { ($array:expr, $cast_fn:expr, $builder:expr) => {{ let array = $array; @@ -78,7 +78,7 @@ pub(crate) use non_generic_conversion_single_value; /// Convert the input array to a `VariantArray` row by row, using `method` /// requiring a generic type to downcast the generic array to a specific /// array type and `cast_fn` to transform each element to a type compatible with Variant -/// If `strict` is true, return error on conversion failure. If false, insert null. +/// If `strict` is true(default), return error on conversion failure. If false, insert null. macro_rules! generic_conversion_array { ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ $crate::type_conversion::non_generic_conversion_array!( From 3ef1e179757579382b65fe545cca415cac14a410 Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Thu, 28 Aug 2025 22:31:31 +0800 Subject: [PATCH 4/7] refactor: eliminate duplication in timestamp conversion using macro Signed-off-by: codephage2020 --- .../src/cast_to_variant.rs | 115 ++++-------------- .../src/type_conversion.rs | 27 ++++ 2 files changed, 49 insertions(+), 93 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 95f3a1704aa6..ea5379941c0b 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use crate::type_conversion::{ decimal_to_variant_decimal, generic_conversion_array, non_generic_conversion_array, - primitive_conversion_array, + primitive_conversion_array, timestamp_to_variant_timestamp, }; use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::{ @@ -46,7 +46,7 @@ use parquet_variant::{ Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; -fn convert_timestamp( +fn convert_timestamp_with_options( time_unit: &TimeUnit, time_zone: &Option>, input: &dyn Array, @@ -59,114 +59,43 @@ fn convert_timestamp( .as_any() .downcast_ref::() .expect("Array is not TimestampSecondArray"); - - if strict { - let mut result = Vec::with_capacity(ts_array.len()); - for x in ts_array.iter() { - match x { - Some(y) => match timestamp_s_to_datetime(y) { - Some(dt) => result.push(Some(dt)), - None => { - return Err(ArrowError::ComputeError( - "Invalid timestamp seconds value".to_string(), - )) - } - }, - None => result.push(None), - } - } - result - } else { - ts_array - .iter() - .map(|x| x.and_then(timestamp_s_to_datetime)) - .collect() - } + timestamp_to_variant_timestamp!(ts_array, timestamp_s_to_datetime, "seconds", strict) } arrow_schema::TimeUnit::Millisecond => { let ts_array = input .as_any() .downcast_ref::() .expect("Array is not TimestampMillisecondArray"); - - if strict { - let mut result = Vec::with_capacity(ts_array.len()); - for x in ts_array.iter() { - match x { - Some(y) => match timestamp_ms_to_datetime(y) { - Some(dt) => result.push(Some(dt)), - None => { - return Err(ArrowError::ComputeError( - "Invalid timestamp milliseconds value".to_string(), - )) - } - }, - None => result.push(None), - } - } - result - } else { - ts_array - .iter() - .map(|x| x.and_then(timestamp_ms_to_datetime)) - .collect() - } + timestamp_to_variant_timestamp!( + ts_array, + timestamp_ms_to_datetime, + "milliseconds", + strict + ) } arrow_schema::TimeUnit::Microsecond => { let ts_array = input .as_any() .downcast_ref::() .expect("Array is not TimestampMicrosecondArray"); - if strict { - let mut result = Vec::with_capacity(ts_array.len()); - for x in ts_array.iter() { - match x { - Some(y) => match timestamp_us_to_datetime(y) { - Some(dt) => result.push(Some(dt)), - None => { - return Err(ArrowError::ComputeError( - "Invalid timestamp microseconds value".to_string(), - )) - } - }, - None => result.push(None), - } - } - result - } else { - ts_array - .iter() - .map(|x| x.and_then(timestamp_us_to_datetime)) - .collect() - } + timestamp_to_variant_timestamp!( + ts_array, + timestamp_us_to_datetime, + "microseconds", + strict + ) } arrow_schema::TimeUnit::Nanosecond => { let ts_array = input .as_any() .downcast_ref::() .expect("Array is not TimestampNanosecondArray"); - if strict { - let mut result = Vec::with_capacity(ts_array.len()); - for x in ts_array.iter() { - match x { - Some(y) => match timestamp_ns_to_datetime(y) { - Some(dt) => result.push(Some(dt)), - None => { - return Err(ArrowError::ComputeError( - "Invalid timestamp nanoseconds value".to_string(), - )) - } - }, - None => result.push(None), - } - } - result - } else { - ts_array - .iter() - .map(|x| x.and_then(timestamp_ns_to_datetime)) - .collect() - } + timestamp_to_variant_timestamp!( + ts_array, + timestamp_ns_to_datetime, + "nanoseconds", + strict + ) } }; @@ -335,7 +264,7 @@ pub fn cast_to_variant_with_options( } } DataType::Timestamp(time_unit, time_zone) => { - convert_timestamp(time_unit, time_zone, input, &mut builder, strict)?; + convert_timestamp_with_options(time_unit, time_zone, input, &mut builder, strict)?; } DataType::Time32(unit) => { match *unit { diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index d6d89684226c..1cfb9230e313 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -158,3 +158,30 @@ macro_rules! decimal_to_variant_decimal { }}; } pub(crate) use decimal_to_variant_decimal; + +/// Convert a timestamp value to a `VariantTimestamp` +macro_rules! timestamp_to_variant_timestamp { + ($ts_array:expr, $converter:expr, $unit_name:expr, $strict:expr) => { + if $strict { + let mut result = Vec::with_capacity($ts_array.len()); + for x in $ts_array.iter() { + match x { + Some(y) => match $converter(y) { + Some(dt) => result.push(Some(dt)), + None => { + return Err(ArrowError::ComputeError(format!( + "Invalid timestamp {} value", + $unit_name + ))) + } + }, + None => result.push(None), + } + } + result + } else { + $ts_array.iter().map(|x| x.and_then($converter)).collect() + } + }; +} +pub(crate) use timestamp_to_variant_timestamp; From 097d4dc1ef63d09bf6571f8cac7c4cc8aa8f5232 Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Sat, 30 Aug 2025 00:11:41 +0800 Subject: [PATCH 5/7] Simplify error handling in cast_to_variant and type_conversion modules Signed-off-by: codephage2020 Co-authored-by: Ryan Johnson --- .../src/cast_to_variant.rs | 16 ++++----- .../src/type_conversion.rs | 36 ++++++------------- 2 files changed, 19 insertions(+), 33 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index ea5379941c0b..03141ca4f673 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -109,13 +109,13 @@ fn convert_timestamp_with_options( builder.append_variant(utc_dt.into()); } } + None if strict && input.is_valid(i) => { + return Err(ArrowError::ComputeError(format!( + "Failed to convert timestamp at index {}: invalid timestamp value", + i + ))); + } None => { - if strict && input.is_valid(i) { - return Err(ArrowError::ComputeError(format!( - "Failed to convert timestamp at index {}: invalid timestamp value", - i - ))); - } builder.append_null(); } } @@ -563,9 +563,9 @@ pub fn cast_to_variant_with_options( Ok(builder.build()) } -/// Convert an array to a `VariantArray` with strict mode enabled (panics on conversion failures). +/// Convert an array to a `VariantArray` with strict mode enabled (returns errors on conversion failures). /// -/// This function provides backward compatibility. For non-panicking behavior, +/// This function provides backward compatibility. For non-strict behavior, /// use `cast_to_variant_with_options` with `strict = false`. pub fn cast_to_variant(input: &dyn Array) -> Result { cast_to_variant_with_options(input, true) diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 1cfb9230e313..aa60b425a18b 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -44,16 +44,13 @@ macro_rules! non_generic_conversion_array { Some(cast_value) => { $builder.append_variant(Variant::from(cast_value)); } - None => { - if $strict { - return Err(ArrowError::ComputeError(format!( - "Failed to convert value at index {}: conversion failed", - i - ))); - } else { - $builder.append_null(); - } + None if $strict => { + return Err(ArrowError::ComputeError(format!( + "Failed to convert value at index {}: conversion failed", + i + ))); } + None => $builder.append_null(), } } Ok::<(), ArrowError>(()) @@ -163,22 +160,11 @@ pub(crate) use decimal_to_variant_decimal; macro_rules! timestamp_to_variant_timestamp { ($ts_array:expr, $converter:expr, $unit_name:expr, $strict:expr) => { if $strict { - let mut result = Vec::with_capacity($ts_array.len()); - for x in $ts_array.iter() { - match x { - Some(y) => match $converter(y) { - Some(dt) => result.push(Some(dt)), - None => { - return Err(ArrowError::ComputeError(format!( - "Invalid timestamp {} value", - $unit_name - ))) - } - }, - None => result.push(None), - } - } - result + let error = + || ArrowError::ComputeError(format!("Invalid timestamp {} value", $unit_name)); + let converter = |x| $converter(x).ok_or_else(error); + let iter = $ts_array.iter().map(|x| x.map(converter).transpose()); + iter.collect::, ArrowError>>()? } else { $ts_array.iter().map(|x| x.and_then($converter)).collect() } From 3c52e6dd0acd7a1f542d26d3df5869f7a946c46a Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Mon, 1 Sep 2025 22:34:07 +0800 Subject: [PATCH 6/7] Introduce CastOptions struct for configurable casting behavior Signed-off-by: codephage2020 --- .../src/cast_to_variant.rs | 49 ++++++++++++------- parquet-variant-compute/src/lib.rs | 2 +- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 03141ca4f673..745a58400bb1 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -46,12 +46,26 @@ use parquet_variant::{ Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; +/// Options for controlling the behavior of `cast_to_variant_with_options`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CastOptions { + /// If true, return error on conversion failure. If false, insert null for failed conversions. + pub strict: bool, +} + +impl Default for CastOptions { + fn default() -> Self { + Self { strict: true } + } +} + + fn convert_timestamp_with_options( time_unit: &TimeUnit, time_zone: &Option>, input: &dyn Array, builder: &mut VariantArrayBuilder, - strict: bool, + options: &CastOptions, ) -> Result<(), ArrowError> { let native_datetimes: Vec> = match time_unit { arrow_schema::TimeUnit::Second => { @@ -59,7 +73,7 @@ fn convert_timestamp_with_options( .as_any() .downcast_ref::() .expect("Array is not TimestampSecondArray"); - timestamp_to_variant_timestamp!(ts_array, timestamp_s_to_datetime, "seconds", strict) + timestamp_to_variant_timestamp!(ts_array, timestamp_s_to_datetime, "seconds", options.strict) } arrow_schema::TimeUnit::Millisecond => { let ts_array = input @@ -70,7 +84,7 @@ fn convert_timestamp_with_options( ts_array, timestamp_ms_to_datetime, "milliseconds", - strict + options.strict ) } arrow_schema::TimeUnit::Microsecond => { @@ -82,7 +96,7 @@ fn convert_timestamp_with_options( ts_array, timestamp_us_to_datetime, "microseconds", - strict + options.strict ) } arrow_schema::TimeUnit::Nanosecond => { @@ -94,7 +108,7 @@ fn convert_timestamp_with_options( ts_array, timestamp_ns_to_datetime, "nanoseconds", - strict + options.strict ) } }; @@ -109,7 +123,7 @@ fn convert_timestamp_with_options( builder.append_variant(utc_dt.into()); } } - None if strict && input.is_valid(i) => { + None if options.strict && input.is_valid(i) => { return Err(ArrowError::ComputeError(format!( "Failed to convert timestamp at index {}: invalid timestamp value", i @@ -155,10 +169,10 @@ fn convert_timestamp_with_options( /// /// # Arguments /// * `input` - The array to convert to VariantArray -/// * `strict` - If true, return error on conversion failure. If false, insert null for failed conversions. +/// * `options` - Options controlling conversion behavior pub fn cast_to_variant_with_options( input: &dyn Array, - strict: bool, + options: &CastOptions, ) -> Result { let mut builder = VariantArrayBuilder::new(input.len()); @@ -264,7 +278,7 @@ pub fn cast_to_variant_with_options( } } DataType::Timestamp(time_unit, time_zone) => { - convert_timestamp_with_options(time_unit, time_zone, input, &mut builder, strict)?; + convert_timestamp_with_options(time_unit, time_zone, input, &mut builder, options)?; } DataType::Time32(unit) => { match *unit { @@ -276,7 +290,7 @@ pub fn cast_to_variant_with_options( |v| NaiveTime::from_num_seconds_from_midnight_opt(v as u32, 0u32), input, builder, - strict + options.strict )?; } TimeUnit::Millisecond => { @@ -289,7 +303,7 @@ pub fn cast_to_variant_with_options( ), input, builder, - strict + options.strict )?; } _ => { @@ -312,7 +326,7 @@ pub fn cast_to_variant_with_options( ), input, builder, - strict + options.strict )?; } TimeUnit::Nanosecond => { @@ -325,7 +339,7 @@ pub fn cast_to_variant_with_options( ), input, builder, - strict + options.strict )?; } _ => { @@ -416,7 +430,7 @@ pub fn cast_to_variant_with_options( |v: i64| Date64Type::to_naive_date_opt(v), input, builder, - strict + options.strict )?; } DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() { @@ -566,9 +580,9 @@ pub fn cast_to_variant_with_options( /// Convert an array to a `VariantArray` with strict mode enabled (returns errors on conversion failures). /// /// This function provides backward compatibility. For non-strict behavior, -/// use `cast_to_variant_with_options` with `strict = false`. +/// use `cast_to_variant_with_options` with `CastOptions { strict: false }`. pub fn cast_to_variant(input: &dyn Array) -> Result { - cast_to_variant_with_options(input, true) + cast_to_variant_with_options(input, &CastOptions::default()) } /// Convert union arrays @@ -2399,7 +2413,8 @@ mod tests { /// against the expected values. It also tests the handling of nulls by /// setting one element to null and verifying the output. fn run_test_with_options(values: ArrayRef, expected: Vec>, strict: bool) { - let variant_array = cast_to_variant_with_options(&values, strict).unwrap(); + let options = CastOptions { strict }; + let variant_array = cast_to_variant_with_options(&values, &options).unwrap(); assert_eq!(variant_array.len(), expected.len()); for (i, expected_value) in expected.iter().enumerate() { match expected_value { diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index 2d5e0cb5a67a..3c928636ac34 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -46,6 +46,6 @@ pub mod variant_get; pub use variant_array::{ShreddingState, VariantArray}; pub use variant_array_builder::{VariantArrayBuilder, VariantArrayVariantBuilder}; -pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options}; +pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options, CastOptions}; pub use from_json::json_to_variant; pub use to_json::variant_to_json; From a136b09dcd8c08b8c6d61e75ccbdf3436b031c90 Mon Sep 17 00:00:00 2001 From: codephage2020 Date: Mon, 1 Sep 2025 22:37:21 +0800 Subject: [PATCH 7/7] fmt Signed-off-by: codephage2020 --- parquet-variant-compute/src/cast_to_variant.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 745a58400bb1..79d503f0f6a3 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -59,7 +59,6 @@ impl Default for CastOptions { } } - fn convert_timestamp_with_options( time_unit: &TimeUnit, time_zone: &Option>, @@ -73,7 +72,12 @@ fn convert_timestamp_with_options( .as_any() .downcast_ref::() .expect("Array is not TimestampSecondArray"); - timestamp_to_variant_timestamp!(ts_array, timestamp_s_to_datetime, "seconds", options.strict) + timestamp_to_variant_timestamp!( + ts_array, + timestamp_s_to_datetime, + "seconds", + options.strict + ) } arrow_schema::TimeUnit::Millisecond => { let ts_array = input