Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issue with "to_date" failing to process dates later than year 2262 #12227

Merged
merged 11 commits into from
Sep 6, 2024
Merged
25 changes: 25 additions & 0 deletions datafusion/functions/src/datetime/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,31 @@ pub(crate) fn string_to_timestamp_nanos_formatted(
})
}

/// Accepts a string with a `chrono` format and converts it to a
/// millisecond precision timestamp.
///
/// See [`chrono::format::strftime`] for the full set of supported formats.
///
/// Internally, this function uses the `chrono` library for the
/// datetime parsing
///
/// ## Timezone / Offset Handling
///
/// Numerical values of timestamps are stored compared to offset UTC.
///
/// Any timestamp in the formatting string is handled according to the rules
/// defined by `chrono`.
///
/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html
///
#[inline]
pub(crate) fn string_to_timestamp_millis_formatted(s: &str, format: &str) -> Result<i64> {
Ok(string_to_datetime_formatted(&Utc, s, format)?
.naive_utc()
.and_utc()
.timestamp_millis())
}

pub(crate) fn handle<'a, O, F, S>(
args: &'a [ColumnarValue],
op: F,
Expand Down
233 changes: 221 additions & 12 deletions datafusion/functions/src/datetime/to_date.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@

use std::any::Any;

use arrow::array::types::Date32Type;
use arrow::datatypes::DataType;
use arrow::datatypes::DataType::Date32;
use arrow::error::ArrowError::ParseError;
use arrow::{array::types::Date32Type, compute::kernels::cast_utils::Parser};

use crate::datetime::common::*;
use datafusion_common::{exec_err, internal_datafusion_err, Result};
use datafusion_common::error::DataFusionError;
use datafusion_common::{arrow_err, exec_err, internal_datafusion_err, Result};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};

#[derive(Debug)]
Expand All @@ -47,22 +49,20 @@ impl ToDateFunc {
match args.len() {
1 => handle::<Date32Type, _, Date32Type>(
args,
|s| {
string_to_timestamp_nanos_shim(s)
.map(|n| n / (1_000_000 * 24 * 60 * 60 * 1_000))
.and_then(|v| {
v.try_into().map_err(|_| {
internal_datafusion_err!("Unable to cast to Date32 for converting from i64 to i32 failed")
})
})
|s| match Date32Type::parse(s) {
Some(v) => Ok(v),
MartinKolbAtWork marked this conversation as resolved.
Show resolved Hide resolved
None => arrow_err!(ParseError(
"Unable to cast to Date32 for converting from i64 to i32 failed"
.to_string()
)),
},
"to_date",
),
2.. => handle_multiple::<Date32Type, _, Date32Type, _>(
args,
|s, format| {
string_to_timestamp_nanos_formatted(s, format)
.map(|n| n / (1_000_000 * 24 * 60 * 60 * 1_000))
string_to_timestamp_millis_formatted(s, format)
.map(|n| n / (24 * 60 * 60 * 1_000))
.and_then(|v| {
v.try_into().map_err(|_| {
internal_datafusion_err!("Unable to cast to Date32 for converting from i64 to i32 failed")
Expand Down Expand Up @@ -118,3 +118,212 @@ impl ScalarUDFImpl for ToDateFunc {
}
}
}

#[cfg(test)]
mod tests {
use arrow::{compute::kernels::cast_utils::Parser, datatypes::Date32Type};
use datafusion_common::ScalarValue;
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

use super::ToDateFunc;

#[test]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😍

Very nice

fn test_to_date_without_format() {
struct TestCase {
name: &'static str,
date_str: &'static str,
}

let test_cases = vec![
TestCase {
name: "Largest four-digit year (9999)",
date_str: "9999-12-31",
},
TestCase {
name: "Year 1 (0001)",
date_str: "0001-12-31",
},
TestCase {
name: "Year before epoch (1969)",
date_str: "1969-01-01",
},
TestCase {
name: "Switch Julian/Gregorian calendar (1582-10-10)",
date_str: "1582-10-10",
},
];

for tc in &test_cases {
let date_scalar = ScalarValue::Utf8(Some(tc.date_str.to_string()));
let to_date_result =
ToDateFunc::new().invoke(&[ColumnarValue::Scalar(date_scalar)]);

match to_date_result {
Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
let expected = Date32Type::parse_formatted(tc.date_str, "%Y-%m-%d");
assert_eq!(
date_val, expected,
"{}: to_date created wrong value",
tc.name
);
}
_ => panic!("Could not convert '{}' to Date", tc.date_str),
}
}
}

#[test]
fn test_to_date_with_format() {
struct TestCase {
name: &'static str,
date_str: &'static str,
format_str: &'static str,
formatted_date: &'static str,
}

let test_cases = vec![
TestCase {
name: "Largest four-digit year (9999)",
date_str: "9999-12-31",
format_str: "%Y%m%d",
formatted_date: "99991231",
},
TestCase {
name: "Smallest four-digit year (-9999)",
date_str: "-9999-12-31",
format_str: "%Y/%m/%d",
formatted_date: "-9999/12/31",
},
TestCase {
name: "Year 1 (0001)",
date_str: "0001-12-31",
format_str: "%Y%m%d",
formatted_date: "00011231",
},
TestCase {
name: "Year before epoch (1969)",
date_str: "1969-01-01",
format_str: "%Y%m%d",
formatted_date: "19690101",
},
TestCase {
name: "Switch Julian/Gregorian calendar (1582-10-10)",
date_str: "1582-10-10",
format_str: "%Y%m%d",
formatted_date: "15821010",
},
TestCase {
name: "Negative Year, BC (-42-01-01)",
date_str: "-42-01-01",
format_str: "%Y/%m/%d",
formatted_date: "-42/01/01",
},
];

for tc in &test_cases {
let formatted_date_scalar =
ScalarValue::Utf8(Some(tc.formatted_date.to_string()));
let format_scalar = ScalarValue::Utf8(Some(tc.format_str.to_string()));

let to_date_result = ToDateFunc::new().invoke(&[
ColumnarValue::Scalar(formatted_date_scalar),
ColumnarValue::Scalar(format_scalar),
]);

match to_date_result {
Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
let expected = Date32Type::parse_formatted(tc.date_str, "%Y-%m-%d");
assert_eq!(date_val, expected, "{}: to_date created wrong value for date '{}' with format string '{}'", tc.name, tc.formatted_date, tc.format_str);
}
_ => panic!(
"Could not convert '{}' with format string '{}'to Date",
tc.date_str, tc.format_str
),
}
}
}

#[test]
fn test_to_date_multiple_format_strings() {
let formatted_date_scalar = ScalarValue::Utf8(Some("2023/01/31".into()));
let format1_scalar = ScalarValue::Utf8(Some("%Y-%m-%d".into()));
let format2_scalar = ScalarValue::Utf8(Some("%Y/%m/%d".into()));

let to_date_result = ToDateFunc::new().invoke(&[
ColumnarValue::Scalar(formatted_date_scalar),
ColumnarValue::Scalar(format1_scalar),
ColumnarValue::Scalar(format2_scalar),
]);

match to_date_result {
Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
let expected = Date32Type::parse_formatted("2023-01-31", "%Y-%m-%d");
assert_eq!(
date_val, expected,
"to_date created wrong value for date with 2 format strings"
);
}
_ => panic!("Conversion failed",),
}
}

#[test]
fn test_to_date_from_timestamp() {
let test_cases = vec![
"2020-09-08T13:42:29Z",
"2020-09-08T13:42:29.190855-05:00",
"2020-09-08 12:13:29",
];
for date_str in test_cases {
let formatted_date_scalar = ScalarValue::Utf8(Some(date_str.into()));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW I think you can write the same thing more concisely like

Suggested change
let formatted_date_scalar = ScalarValue::Utf8(Some(date_str.into()));
let formatted_date_scalar = ScalarValue::from(date_str);


let to_date_result =
ToDateFunc::new().invoke(&[ColumnarValue::Scalar(formatted_date_scalar)]);

match to_date_result {
Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
Comment on lines +280 to +284
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The machinery to call the function is also a but clunky, I wonder if it would make the tests easier to read (as there would be less conversion machinery) if we added a function like

/// Invokes ToDate on the input string, and returns the resulting Date32 value
fn run_to_date(input: &str) -> Result<i32> {
..
}

let expected = Date32Type::parse_formatted("2020-09-08", "%Y-%m-%d");
assert_eq!(date_val, expected, "to_date created wrong value");
}
_ => panic!("Conversion of {} failed", date_str),
}
}
}

#[test]
fn test_to_date_string_with_valid_number() {
let date_str = "20241231";
let date_scalar = ScalarValue::Utf8(Some(date_str.into()));

let to_date_result =
ToDateFunc::new().invoke(&[ColumnarValue::Scalar(date_scalar)]);

match to_date_result {
Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
let expected = Date32Type::parse_formatted("2024-12-31", "%Y-%m-%d");
assert_eq!(
date_val, expected,
"to_date created wrong value for {}",
date_str
);
}
_ => panic!("Conversion of {} failed", date_str),
}
}

#[test]
fn test_to_date_string_with_invalid_number() {
let date_str = "202412311";
let date_scalar = ScalarValue::Utf8(Some(date_str.into()));

let to_date_result =
ToDateFunc::new().invoke(&[ColumnarValue::Scalar(date_scalar)]);

if let Ok(ColumnarValue::Scalar(ScalarValue::Date32(_))) = to_date_result {
panic!(
"Conversion of {} succeded, but should have failed, ",
date_str
);
}
}
}
10 changes: 9 additions & 1 deletion datafusion/sqllogictest/test_files/dates.slt
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ SELECT to_date(ts / 100000000) FROM to_date_t1 LIMIT 3
2003-11-02
2003-11-29

# verify date with time zone, where the time zone date is already the next day, but result date in UTC is day before
query D
SELECT to_date('01-14-2023 01:01:30+05:30', '%q', '%d-%m-%Y %H/%M/%S', '%+', '%m-%d-%Y %H:%M:%S%#z');
----
Expand All @@ -137,8 +138,15 @@ select to_date(arrow_cast(123, 'Int64'))
----
1970-05-04

statement error DataFusion error: Arrow error:
# Parse sequence of digits which yield a valid date, e.g. "21311111" would be "2131-11-11"
query D
SELECT to_date('21311111');
----
2131-11-11

# Parse sequence of digits which do not make up a valid date
statement error DataFusion error: Arrow error:
SELECT to_date('213111111');

# verify date cast with integer input
query DDDDDD
Expand Down
4 changes: 1 addition & 3 deletions docs/source/user-guide/sql/scalar_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -1735,9 +1735,7 @@ Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format]s are
Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`).
Returns the corresponding date.

Note: `to_date` returns Date32. The supported range for integer input is between `-96465293` and `95026237`.
Supported range for string input is between `1677-09-21` and `2262-04-11` exclusive. To parse dates outside of
that range use a [Chrono format].
Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`.

```
to_date(expression[, ..., format_n])
Expand Down