diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs index ec69db7903779..314529b99a342 100644 --- a/datafusion/common/src/types/builtin.rs +++ b/datafusion/common/src/types/builtin.rs @@ -15,9 +15,17 @@ // specific language governing permissions and limitations // under the License. +use arrow::datatypes::IntervalUnit::*; + use crate::types::{LogicalTypeRef, NativeType}; use std::sync::{Arc, LazyLock}; +/// Create a singleton and accompanying static variable for a [`LogicalTypeRef`] +/// of a [`NativeType`]. +/// * `name`: name of the static variable, must be unique. +/// * `getter`: name of the public function that will return the singleton instance +/// of the static variable. +/// * `ty`: the [`NativeType`]. macro_rules! singleton { ($name:ident, $getter:ident, $ty:ident) => { static $name: LazyLock = @@ -31,6 +39,26 @@ macro_rules! singleton { }; } +/// Similar to [`singleton`], but for native types that have variants, such as +/// `NativeType::Interval(MonthDayNano)`. +/// * `name`: name of the static variable, must be unique. +/// * `getter`: name of the public function that will return the singleton instance +/// of the static variable. +/// * `ty`: the [`NativeType`]. +/// * `variant`: specific variant of the `ty`. +macro_rules! singleton_variant { + ($name:ident, $getter:ident, $ty:ident, $variant:ident) => { + static $name: LazyLock = + LazyLock::new(|| Arc::new(NativeType::$ty($variant))); + + #[doc = "Getter for singleton instance of a logical type representing"] + #[doc = concat!("[`NativeType::", stringify!($ty), "`] of unit [`", stringify!($variant),"`].`")] + pub fn $getter() -> LogicalTypeRef { + Arc::clone(&$name) + } + }; +} + singleton!(LOGICAL_NULL, logical_null, Null); singleton!(LOGICAL_BOOLEAN, logical_boolean, Boolean); singleton!(LOGICAL_INT8, logical_int8, Int8); @@ -47,3 +75,10 @@ singleton!(LOGICAL_FLOAT64, logical_float64, Float64); singleton!(LOGICAL_DATE, logical_date, Date); singleton!(LOGICAL_BINARY, logical_binary, Binary); singleton!(LOGICAL_STRING, logical_string, String); + +singleton_variant!( + LOGICAL_INTERVAL_MDN, + logical_interval_mdn, + Interval, + MonthDayNano +); diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 5cef0adfbde80..8c41701ae5768 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -486,4 +486,9 @@ impl NativeType { pub fn is_binary(&self) -> bool { matches!(self, NativeType::Binary | NativeType::FixedSizeBinary(_)) } + + #[inline] + pub fn is_null(&self) -> bool { + matches!(self, NativeType::Null) + } } diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 5cb7a17ee3128..2bf7092dd2224 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -382,10 +382,7 @@ impl TypeSignatureClass { } /// Does the specified `NativeType` match this type signature class? - pub fn matches_native_type( - self: &TypeSignatureClass, - logical_type: &NativeType, - ) -> bool { + pub fn matches_native_type(&self, logical_type: &NativeType) -> bool { if logical_type == &NativeType::Null { return true; } @@ -431,6 +428,7 @@ impl TypeSignatureClass { TypeSignatureClass::Binary if native_type.is_binary() => { Ok(origin_type.to_owned()) } + _ if native_type.is_null() => Ok(origin_type.to_owned()), _ => internal_err!("May miss the matching logic in `matches_native_type`"), } } diff --git a/datafusion/functions-nested/src/range.rs b/datafusion/functions-nested/src/range.rs index 01c6e9c43f2ef..e570ecf97420f 100644 --- a/datafusion/functions-nested/src/range.rs +++ b/datafusion/functions-nested/src/range.rs @@ -18,33 +18,39 @@ //! [`ScalarUDFImpl`] definitions for range and gen_series functions. use crate::utils::make_scalar_function; -use arrow::array::{ - builder::{Date32Builder, TimestampNanosecondBuilder}, - temporal_conversions::as_datetime_with_timezone, - timezone::Tz, - types::{Date32Type, IntervalMonthDayNanoType, TimestampNanosecondType}, - Array, ArrayRef, Int64Array, ListArray, ListBuilder, NullBufferBuilder, -}; use arrow::buffer::OffsetBuffer; -use arrow::datatypes::{ - DataType, DataType::*, Field, IntervalUnit::MonthDayNano, TimeUnit::Nanosecond, +use arrow::datatypes::TimeUnit; +use arrow::datatypes::{DataType, Field, IntervalUnit::MonthDayNano}; +use arrow::{ + array::{ + builder::{Date32Builder, TimestampNanosecondBuilder}, + temporal_conversions::as_datetime_with_timezone, + timezone::Tz, + types::{Date32Type, IntervalMonthDayNanoType, TimestampNanosecondType}, + Array, ArrayRef, Int64Array, ListArray, ListBuilder, NullBufferBuilder, + }, + compute::cast, }; +use datafusion_common::internal_err; use datafusion_common::{ cast::{ as_date32_array, as_int64_array, as_interval_mdn_array, as_timestamp_nanosecond_array, }, - DataFusionError, ScalarValue, + types::{ + logical_date, logical_int64, logical_interval_mdn, logical_string, NativeType, + }, + ScalarValue, }; use datafusion_common::{ exec_datafusion_err, exec_err, not_impl_datafusion_err, utils::take_function_args, Result, }; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, + TypeSignatureClass, Volatility, }; use datafusion_macros::user_doc; -use itertools::Itertools; use std::any::Any; use std::cmp::Ordering; use std::iter::from_fn; @@ -146,10 +152,52 @@ impl Default for Range { } impl Range { + fn defined_signature() -> Signature { + // We natively only support i64 in our implementation; so ensure we cast other integer + // types to it. + let integer = Coercion::new_implicit( + TypeSignatureClass::Native(logical_int64()), + vec![TypeSignatureClass::Integer], + NativeType::Int64, + ); + // We natively only support mdn in our implementation; so ensure we cast other interval + // types to it. + let interval = Coercion::new_implicit( + TypeSignatureClass::Native(logical_interval_mdn()), + vec![TypeSignatureClass::Interval], + NativeType::Interval(MonthDayNano), + ); + // Ideally we'd limit to only Date32 & Timestamp(Nanoseconds) as those are the implementations + // we have but that is difficult to do with this current API; we'll cast later on to + // handle such types. + let date = Coercion::new_implicit( + TypeSignatureClass::Native(logical_date()), + vec![TypeSignatureClass::Native(logical_string())], + NativeType::Date, + ); + let timestamp = Coercion::new_exact(TypeSignatureClass::Timestamp); + Signature::one_of( + vec![ + // Integer ranges + // Stop + TypeSignature::Coercible(vec![integer.clone()]), + // Start & stop + TypeSignature::Coercible(vec![integer.clone(), integer.clone()]), + // Start, stop & step + TypeSignature::Coercible(vec![integer.clone(), integer.clone(), integer]), + // Date range + TypeSignature::Coercible(vec![date.clone(), date, interval.clone()]), + // Timestamp range + TypeSignature::Coercible(vec![timestamp.clone(), timestamp, interval]), + ], + Volatility::Immutable, + ) + } + /// Generate `range()` function which excludes upper bound. pub fn new() -> Self { Self { - signature: Signature::user_defined(Volatility::Immutable), + signature: Self::defined_signature(), include_upper_bound: false, } } @@ -157,7 +205,7 @@ impl Range { /// Generate `generate_series()` function which includes upper bound. fn generate_series() -> Self { Self { - signature: Signature::user_defined(Volatility::Immutable), + signature: Self::defined_signature(), include_upper_bound: true, } } @@ -180,39 +228,27 @@ impl ScalarUDFImpl for Range { &self.signature } - fn coerce_types(&self, arg_types: &[DataType]) -> Result> { - arg_types - .iter() - .map(|arg_type| match arg_type { - Null => Ok(Null), - Int8 => Ok(Int64), - Int16 => Ok(Int64), - Int32 => Ok(Int64), - Int64 => Ok(Int64), - UInt8 => Ok(Int64), - UInt16 => Ok(Int64), - UInt32 => Ok(Int64), - UInt64 => Ok(Int64), - Timestamp(_, tz) => Ok(Timestamp(Nanosecond, tz.clone())), - Date32 => Ok(Date32), - Date64 => Ok(Date32), - Utf8 => Ok(Date32), - LargeUtf8 => Ok(Date32), - Utf8View => Ok(Date32), - Interval(_) => Ok(Interval(MonthDayNano)), - _ => exec_err!("Unsupported DataType"), - }) - .try_collect() - } - fn return_type(&self, arg_types: &[DataType]) -> Result { if arg_types.iter().any(|t| t.is_null()) { - Ok(Null) - } else { - Ok(List(Arc::new(Field::new_list_field( + return Ok(DataType::Null); + } + + match (&arg_types[0], arg_types.get(1)) { + // In implementation we downcast to Date32 so ensure reflect that here + (_, Some(DataType::Date64)) | (DataType::Date64, _) => Ok(DataType::List( + Arc::new(Field::new_list_field(DataType::Date32, true)), + )), + // Ensure we preserve timezone + (DataType::Timestamp(_, tz), _) => { + Ok(DataType::List(Arc::new(Field::new_list_field( + DataType::Timestamp(TimeUnit::Nanosecond, tz.to_owned()), + true, + )))) + } + _ => Ok(DataType::List(Arc::new(Field::new_list_field( arg_types[0].clone(), true, - )))) + )))), } } @@ -226,13 +262,20 @@ impl ScalarUDFImpl for Range { return Ok(ColumnarValue::Scalar(ScalarValue::Null)); } match args[0].data_type() { - Int64 => make_scalar_function(|args| self.gen_range_inner(args))(args), - Date32 => make_scalar_function(|args| self.gen_range_date(args))(args), - Timestamp(_, _) => { + DataType::Int64 => { + make_scalar_function(|args| self.gen_range_inner(args))(args) + } + DataType::Date32 | DataType::Date64 => { + make_scalar_function(|args| self.gen_range_date(args))(args) + } + DataType::Timestamp(_, _) => { make_scalar_function(|args| self.gen_range_timestamp(args))(args) } dt => { - exec_err!("unsupported type for {}. Expected Int64, Date32 or Timestamp, got: {dt}", self.name()) + internal_err!( + "Signature failed to guard unknown input type for {}: {dt}", + self.name() + ) } } } @@ -274,7 +317,7 @@ impl Range { as_int64_array(stop_array)?, Some(as_int64_array(step_array)?), ), - _ => return exec_err!("{} expects 1 to 3 arguments", self.name()), + _ => return internal_err!("{} expects 1 to 3 arguments", self.name()), }; let mut values = vec![]; @@ -310,7 +353,7 @@ impl Range { }; } let arr = Arc::new(ListArray::try_new( - Arc::new(Field::new_list_field(Int64, true)), + Arc::new(Field::new_list_field(DataType::Int64, true)), OffsetBuffer::new(offsets.into()), Arc::new(Int64Array::from(values)), valid.finish(), @@ -320,29 +363,28 @@ impl Range { fn gen_range_date(&self, args: &[ArrayRef]) -> Result { let [start, stop, step] = take_function_args(self.name(), args)?; + let step = as_interval_mdn_array(step)?; - let (start_array, stop_array, step_array) = ( - as_date32_array(start)?, - as_date32_array(stop)?, - as_interval_mdn_array(step)?, - ); + // Signature can only guarantee we get a date type, not specifically + // date32 so handle potential cast from date64 here. + let start = cast(start, &DataType::Date32)?; + let start = as_date32_array(&start)?; + let stop = cast(stop, &DataType::Date32)?; + let stop = as_date32_array(&stop)?; // values are date32s let values_builder = Date32Builder::new(); let mut list_builder = ListBuilder::new(values_builder); - for idx in 0..stop_array.len() { - if start_array.is_null(idx) - || stop_array.is_null(idx) - || step_array.is_null(idx) - { + for idx in 0..stop.len() { + if start.is_null(idx) || stop.is_null(idx) || step.is_null(idx) { list_builder.append_null(); continue; } - let start = start_array.value(idx); - let stop = stop_array.value(idx); - let step = step_array.value(idx); + let start = start.value(idx); + let stop = stop.value(idx); + let step = step.value(idx); let (months, days, _) = IntervalMonthDayNanoType::to_parts(step); if months == 0 && days == 0 { @@ -378,44 +420,45 @@ impl Range { fn gen_range_timestamp(&self, args: &[ArrayRef]) -> Result { let [start, stop, step] = take_function_args(self.name(), args)?; + let step = as_interval_mdn_array(step)?; + + // Signature can only guarantee we get a timestamp type, not specifically + // timestamp(ns) so handle potential cast from other timestamps here. + fn cast_to_ns(arr: &ArrayRef) -> Result { + match arr.data_type() { + DataType::Timestamp(TimeUnit::Nanosecond, _) => Ok(Arc::clone(arr)), + DataType::Timestamp(_, tz) => Ok(cast( + arr, + &DataType::Timestamp(TimeUnit::Nanosecond, tz.to_owned()), + )?), + _ => unreachable!(), + } + } + let start = cast_to_ns(start)?; + let start = as_timestamp_nanosecond_array(&start)?; + let stop = cast_to_ns(stop)?; + let stop = as_timestamp_nanosecond_array(&stop)?; - // coerce_types fn should coerce all types to Timestamp(Nanosecond, tz) - // TODO: remove these map_err once the signature is robust enough to guard against this - let start_arr = as_timestamp_nanosecond_array(start).map_err(|_e| { - DataFusionError::Internal(format!( - "Unexpected argument type for {} : {}", - self.name(), - start.data_type() - )) - })?; - let stop_arr = as_timestamp_nanosecond_array(stop).map_err(|_e| { - DataFusionError::Internal(format!( - "Unexpected argument type for {} : {}", - self.name(), - stop.data_type() - )) - })?; - let step_arr = as_interval_mdn_array(step)?; - let start_tz = parse_tz(&start_arr.timezone())?; - let stop_tz = parse_tz(&stop_arr.timezone())?; + let start_tz = parse_tz(&start.timezone())?; + let stop_tz = parse_tz(&stop.timezone())?; // values are timestamps - let values_builder = start_arr + let values_builder = start .timezone() .map_or_else(TimestampNanosecondBuilder::new, |start_tz_str| { TimestampNanosecondBuilder::new().with_timezone(start_tz_str) }); let mut list_builder = ListBuilder::new(values_builder); - for idx in 0..start_arr.len() { - if start_arr.is_null(idx) || stop_arr.is_null(idx) || step_arr.is_null(idx) { + for idx in 0..start.len() { + if start.is_null(idx) || stop.is_null(idx) || step.is_null(idx) { list_builder.append_null(); continue; } - let start = start_arr.value(idx); - let stop = stop_arr.value(idx); - let step = step_arr.value(idx); + let start = start.value(idx); + let stop = stop.value(idx); + let step = step.value(idx); let (months, days, ns) = IntervalMonthDayNanoType::to_parts(step); if months == 0 && days == 0 && ns == 0 { diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 144e3b757adf3..5c74f3ddc6134 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6949,6 +6949,23 @@ select range(5), ---- [0, 1, 2, 3, 4] [2, 3, 4] [2, 5, 8] [10, 7, 4] [] [] [1, 0, -1, -2, -3, -4] [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] [1993-02-01, 1993-01-31, 1993-01-30, 1993-01-29, 1993-01-28, 1993-01-27, 1993-01-26, 1993-01-25, 1993-01-24, 1993-01-23, 1993-01-22, 1993-01-21, 1993-01-20, 1993-01-19, 1993-01-18, 1993-01-17, 1993-01-16, 1993-01-15, 1993-01-14, 1993-01-13, 1993-01-12, 1993-01-11, 1993-01-10, 1993-01-09, 1993-01-08, 1993-01-07, 1993-01-06, 1993-01-05, 1993-01-04, 1993-01-03, 1993-01-02] [1989-04-01, 1990-04-01, 1991-04-01] [] +# Ensure can coerce from other valid types +query ??????????? +select range(5), + range(2, 5), + range(2, 10, 3), + range(10, 2, -3), + range(arrow_cast(1, 'Int8'), 5, -1), + range(arrow_cast(1, 'Int16'), arrow_cast(-5, 'Int8'), 1), + range(arrow_cast(1, 'Int32'), arrow_cast(-5, 'Int16'), arrow_cast(-1, 'Int8')), + range(DATE '1992-09-01', DATE '1993-03-01', arrow_cast('1 MONTH', 'Interval(YearMonth)')), + range(DATE '1993-02-01', arrow_cast(DATE '1993-01-01', 'Date64'), INTERVAL '-1' DAY), + range(arrow_cast(DATE '1989-04-01', 'Date64'), DATE '1993-03-01', INTERVAL '1' YEAR), + range(arrow_cast(DATE '1993-03-01', 'Date64'), arrow_cast(DATE '1989-04-01', 'Date64'), INTERVAL '1' YEAR) +; +---- +[0, 1, 2, 3, 4] [2, 3, 4] [2, 5, 8] [10, 7, 4] [] [] [1, 0, -1, -2, -3, -4] [1992-09-01, 1992-10-01, 1992-11-01, 1992-12-01, 1993-01-01, 1993-02-01] [1993-02-01, 1993-01-31, 1993-01-30, 1993-01-29, 1993-01-28, 1993-01-27, 1993-01-26, 1993-01-25, 1993-01-24, 1993-01-23, 1993-01-22, 1993-01-21, 1993-01-20, 1993-01-19, 1993-01-18, 1993-01-17, 1993-01-16, 1993-01-15, 1993-01-14, 1993-01-13, 1993-01-12, 1993-01-11, 1993-01-10, 1993-01-09, 1993-01-08, 1993-01-07, 1993-01-06, 1993-01-05, 1993-01-04, 1993-01-03, 1993-01-02] [1989-04-01, 1990-04-01, 1991-04-01] [] + # Test range with zero step query error DataFusion error: Execution error: step can't be 0 for function range\(start \[, stop, step\]\) select range(1, 1, 0); @@ -7114,6 +7131,17 @@ select generate_series('2021-01-01'::timestamp, '2021-01-01T15:00:00'::timestamp ---- [2021-01-01T00:00:00, 2021-01-01T01:00:00, 2021-01-01T02:00:00, 2021-01-01T03:00:00, 2021-01-01T04:00:00, 2021-01-01T05:00:00, 2021-01-01T06:00:00, 2021-01-01T07:00:00, 2021-01-01T08:00:00, 2021-01-01T09:00:00, 2021-01-01T10:00:00, 2021-01-01T11:00:00, 2021-01-01T12:00:00, 2021-01-01T13:00:00, 2021-01-01T14:00:00, 2021-01-01T15:00:00] +# Other timestamp types are coerced to nanosecond +query ? +select generate_series(arrow_cast('2021-01-01'::timestamp, 'Timestamp(Second, None)'), '2021-01-01T15:00:00'::timestamp, INTERVAL '1' HOUR); +---- +[2021-01-01T00:00:00, 2021-01-01T01:00:00, 2021-01-01T02:00:00, 2021-01-01T03:00:00, 2021-01-01T04:00:00, 2021-01-01T05:00:00, 2021-01-01T06:00:00, 2021-01-01T07:00:00, 2021-01-01T08:00:00, 2021-01-01T09:00:00, 2021-01-01T10:00:00, 2021-01-01T11:00:00, 2021-01-01T12:00:00, 2021-01-01T13:00:00, 2021-01-01T14:00:00, 2021-01-01T15:00:00] + +query ? +select generate_series('2021-01-01'::timestamp, arrow_cast('2021-01-01T15:00:00'::timestamp, 'Timestamp(Microsecond, None)'), INTERVAL '1' HOUR); +---- +[2021-01-01T00:00:00, 2021-01-01T01:00:00, 2021-01-01T02:00:00, 2021-01-01T03:00:00, 2021-01-01T04:00:00, 2021-01-01T05:00:00, 2021-01-01T06:00:00, 2021-01-01T07:00:00, 2021-01-01T08:00:00, 2021-01-01T09:00:00, 2021-01-01T10:00:00, 2021-01-01T11:00:00, 2021-01-01T12:00:00, 2021-01-01T13:00:00, 2021-01-01T14:00:00, 2021-01-01T15:00:00] + query ? select generate_series('2021-01-01T00:00:00EST'::timestamp, '2021-01-01T15:00:00-12:00'::timestamp, INTERVAL '1' HOUR); ---- @@ -7131,9 +7159,18 @@ select generate_series(arrow_cast('2021-01-01T00:00:00', 'Timestamp(Nanosecond, [2021-01-01T00:00:00-05:00, 2021-01-01T01:29:54.500-05:00, 2021-01-01T02:59:49-05:00, 2021-01-01T04:29:43.500-05:00, 2021-01-01T05:59:38-05:00] ## mixing types for timestamps is not supported -query error DataFusion error: Internal error: Unexpected argument type for generate_series : Date32 +query error DataFusion error: Error during planning: Internal error: Function 'generate_series' failed to match any signature select generate_series(arrow_cast('2021-01-01T00:00:00', 'Timestamp(Nanosecond, Some("-05:00"))'), DATE '2021-01-02', INTERVAL '1' HOUR); +## mixing types not allowed even if an argument is null +query error DataFusion error: Error during planning: Internal error: Function 'generate_series' failed to match any signature +select generate_series(TIMESTAMP '1992-09-01', DATE '1993-03-01', NULL); + +query error DataFusion error: Error during planning: Internal error: Function 'generate_series' failed to match any signature +select generate_series(1, '2024-01-01', '2025-01-02'); + +query error DataFusion error: Error during planning: Internal error: Function 'generate_series' failed to match any signature +select generate_series('2024-01-01'::timestamp, '2025-01-02', interval '1 day'); ## should return NULL query ? @@ -7152,11 +7189,6 @@ select generate_series(DATE '1992-09-01', DATE '1993-03-01', NULL); ---- NULL -query ? -select generate_series(TIMESTAMP '1992-09-01', DATE '1993-03-01', NULL); ----- -NULL - query ? select generate_series(NULL, DATE '1993-03-01', INTERVAL '1' YEAR); ----