Skip to content

Commit

Permalink
Fix issue with "to_date" failing to process dates later than year 2262 (
Browse files Browse the repository at this point in the history
#12227)

* Fix to_date: Support date values greater than year 2262

Due to using nanoseconds as an intermediate state, the values for dates processed via "to_date" cannot be later than the year 2262. The Arrow datatype for Date32 and Date64 supports much larger values.

The statements in some areas in the code that the usage of nanoseconds is imposed by Arrow is simply wrong. The Date32 type stores the number of days since epoch. The Date64 type stores the milliseconds (NOT nanoseconds) since epoch. Both Date32 and Date64 can therefore massively exceed the year 2262.
See: https://arrow.apache.org/docs/cpp/api/datatype.html

* Improved checks in unit tests

* to_date: Enable handling of nagative years and add more tests

* to_date: Fix test failures and update doc in MarkDown

* to_date: Revert special handling for negative years, focus on years up to 9999

* to_date: Fix clippy warnings

* Move string_to_timestamp_millis_formatted to common.rs

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
MartinKolbAtWork and alamb committed Sep 6, 2024
1 parent 7ead2ad commit 1a44ad2
Show file tree
Hide file tree
Showing 4 changed files with 256 additions and 16 deletions.
25 changes: 25 additions & 0 deletions datafusion/functions/src/datetime/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,31 @@ pub(crate) fn string_to_timestamp_nanos_formatted(
})
}

/// Accepts a string with a `chrono` format and converts it to a
/// millisecond precision timestamp.
///
/// See [`chrono::format::strftime`] for the full set of supported formats.
///
/// Internally, this function uses the `chrono` library for the
/// datetime parsing
///
/// ## Timezone / Offset Handling
///
/// Numerical values of timestamps are stored compared to offset UTC.
///
/// Any timestamp in the formatting string is handled according to the rules
/// defined by `chrono`.
///
/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html
///
#[inline]
pub(crate) fn string_to_timestamp_millis_formatted(s: &str, format: &str) -> Result<i64> {
Ok(string_to_datetime_formatted(&Utc, s, format)?
.naive_utc()
.and_utc()
.timestamp_millis())
}

pub(crate) fn handle<'a, O, F, S>(
args: &'a [ColumnarValue],
op: F,
Expand Down
233 changes: 221 additions & 12 deletions datafusion/functions/src/datetime/to_date.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@

use std::any::Any;

use arrow::array::types::Date32Type;
use arrow::datatypes::DataType;
use arrow::datatypes::DataType::Date32;
use arrow::error::ArrowError::ParseError;
use arrow::{array::types::Date32Type, compute::kernels::cast_utils::Parser};

use crate::datetime::common::*;
use datafusion_common::{exec_err, internal_datafusion_err, Result};
use datafusion_common::error::DataFusionError;
use datafusion_common::{arrow_err, exec_err, internal_datafusion_err, Result};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};

#[derive(Debug)]
Expand All @@ -47,22 +49,20 @@ impl ToDateFunc {
match args.len() {
1 => handle::<Date32Type, _, Date32Type>(
args,
|s| {
string_to_timestamp_nanos_shim(s)
.map(|n| n / (1_000_000 * 24 * 60 * 60 * 1_000))
.and_then(|v| {
v.try_into().map_err(|_| {
internal_datafusion_err!("Unable to cast to Date32 for converting from i64 to i32 failed")
})
})
|s| match Date32Type::parse(s) {
Some(v) => Ok(v),
None => arrow_err!(ParseError(
"Unable to cast to Date32 for converting from i64 to i32 failed"
.to_string()
)),
},
"to_date",
),
2.. => handle_multiple::<Date32Type, _, Date32Type, _>(
args,
|s, format| {
string_to_timestamp_nanos_formatted(s, format)
.map(|n| n / (1_000_000 * 24 * 60 * 60 * 1_000))
string_to_timestamp_millis_formatted(s, format)
.map(|n| n / (24 * 60 * 60 * 1_000))
.and_then(|v| {
v.try_into().map_err(|_| {
internal_datafusion_err!("Unable to cast to Date32 for converting from i64 to i32 failed")
Expand Down Expand Up @@ -118,3 +118,212 @@ impl ScalarUDFImpl for ToDateFunc {
}
}
}

#[cfg(test)]
mod tests {
use arrow::{compute::kernels::cast_utils::Parser, datatypes::Date32Type};
use datafusion_common::ScalarValue;
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

use super::ToDateFunc;

#[test]
fn test_to_date_without_format() {
struct TestCase {
name: &'static str,
date_str: &'static str,
}

let test_cases = vec![
TestCase {
name: "Largest four-digit year (9999)",
date_str: "9999-12-31",
},
TestCase {
name: "Year 1 (0001)",
date_str: "0001-12-31",
},
TestCase {
name: "Year before epoch (1969)",
date_str: "1969-01-01",
},
TestCase {
name: "Switch Julian/Gregorian calendar (1582-10-10)",
date_str: "1582-10-10",
},
];

for tc in &test_cases {
let date_scalar = ScalarValue::Utf8(Some(tc.date_str.to_string()));
let to_date_result =
ToDateFunc::new().invoke(&[ColumnarValue::Scalar(date_scalar)]);

match to_date_result {
Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
let expected = Date32Type::parse_formatted(tc.date_str, "%Y-%m-%d");
assert_eq!(
date_val, expected,
"{}: to_date created wrong value",
tc.name
);
}
_ => panic!("Could not convert '{}' to Date", tc.date_str),
}
}
}

#[test]
fn test_to_date_with_format() {
struct TestCase {
name: &'static str,
date_str: &'static str,
format_str: &'static str,
formatted_date: &'static str,
}

let test_cases = vec![
TestCase {
name: "Largest four-digit year (9999)",
date_str: "9999-12-31",
format_str: "%Y%m%d",
formatted_date: "99991231",
},
TestCase {
name: "Smallest four-digit year (-9999)",
date_str: "-9999-12-31",
format_str: "%Y/%m/%d",
formatted_date: "-9999/12/31",
},
TestCase {
name: "Year 1 (0001)",
date_str: "0001-12-31",
format_str: "%Y%m%d",
formatted_date: "00011231",
},
TestCase {
name: "Year before epoch (1969)",
date_str: "1969-01-01",
format_str: "%Y%m%d",
formatted_date: "19690101",
},
TestCase {
name: "Switch Julian/Gregorian calendar (1582-10-10)",
date_str: "1582-10-10",
format_str: "%Y%m%d",
formatted_date: "15821010",
},
TestCase {
name: "Negative Year, BC (-42-01-01)",
date_str: "-42-01-01",
format_str: "%Y/%m/%d",
formatted_date: "-42/01/01",
},
];

for tc in &test_cases {
let formatted_date_scalar =
ScalarValue::Utf8(Some(tc.formatted_date.to_string()));
let format_scalar = ScalarValue::Utf8(Some(tc.format_str.to_string()));

let to_date_result = ToDateFunc::new().invoke(&[
ColumnarValue::Scalar(formatted_date_scalar),
ColumnarValue::Scalar(format_scalar),
]);

match to_date_result {
Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
let expected = Date32Type::parse_formatted(tc.date_str, "%Y-%m-%d");
assert_eq!(date_val, expected, "{}: to_date created wrong value for date '{}' with format string '{}'", tc.name, tc.formatted_date, tc.format_str);
}
_ => panic!(
"Could not convert '{}' with format string '{}'to Date",
tc.date_str, tc.format_str
),
}
}
}

#[test]
fn test_to_date_multiple_format_strings() {
let formatted_date_scalar = ScalarValue::Utf8(Some("2023/01/31".into()));
let format1_scalar = ScalarValue::Utf8(Some("%Y-%m-%d".into()));
let format2_scalar = ScalarValue::Utf8(Some("%Y/%m/%d".into()));

let to_date_result = ToDateFunc::new().invoke(&[
ColumnarValue::Scalar(formatted_date_scalar),
ColumnarValue::Scalar(format1_scalar),
ColumnarValue::Scalar(format2_scalar),
]);

match to_date_result {
Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
let expected = Date32Type::parse_formatted("2023-01-31", "%Y-%m-%d");
assert_eq!(
date_val, expected,
"to_date created wrong value for date with 2 format strings"
);
}
_ => panic!("Conversion failed",),
}
}

#[test]
fn test_to_date_from_timestamp() {
let test_cases = vec![
"2020-09-08T13:42:29Z",
"2020-09-08T13:42:29.190855-05:00",
"2020-09-08 12:13:29",
];
for date_str in test_cases {
let formatted_date_scalar = ScalarValue::Utf8(Some(date_str.into()));

let to_date_result =
ToDateFunc::new().invoke(&[ColumnarValue::Scalar(formatted_date_scalar)]);

match to_date_result {
Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
let expected = Date32Type::parse_formatted("2020-09-08", "%Y-%m-%d");
assert_eq!(date_val, expected, "to_date created wrong value");
}
_ => panic!("Conversion of {} failed", date_str),
}
}
}

#[test]
fn test_to_date_string_with_valid_number() {
let date_str = "20241231";
let date_scalar = ScalarValue::Utf8(Some(date_str.into()));

let to_date_result =
ToDateFunc::new().invoke(&[ColumnarValue::Scalar(date_scalar)]);

match to_date_result {
Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
let expected = Date32Type::parse_formatted("2024-12-31", "%Y-%m-%d");
assert_eq!(
date_val, expected,
"to_date created wrong value for {}",
date_str
);
}
_ => panic!("Conversion of {} failed", date_str),
}
}

#[test]
fn test_to_date_string_with_invalid_number() {
let date_str = "202412311";
let date_scalar = ScalarValue::Utf8(Some(date_str.into()));

let to_date_result =
ToDateFunc::new().invoke(&[ColumnarValue::Scalar(date_scalar)]);

if let Ok(ColumnarValue::Scalar(ScalarValue::Date32(_))) = to_date_result {
panic!(
"Conversion of {} succeded, but should have failed, ",
date_str
);
}
}
}
10 changes: 9 additions & 1 deletion datafusion/sqllogictest/test_files/dates.slt
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ SELECT to_date(ts / 100000000) FROM to_date_t1 LIMIT 3
2003-11-02
2003-11-29

# verify date with time zone, where the time zone date is already the next day, but result date in UTC is day before
query D
SELECT to_date('01-14-2023 01:01:30+05:30', '%q', '%d-%m-%Y %H/%M/%S', '%+', '%m-%d-%Y %H:%M:%S%#z');
----
Expand All @@ -137,8 +138,15 @@ select to_date(arrow_cast(123, 'Int64'))
----
1970-05-04

statement error DataFusion error: Arrow error:
# Parse sequence of digits which yield a valid date, e.g. "21311111" would be "2131-11-11"
query D
SELECT to_date('21311111');
----
2131-11-11

# Parse sequence of digits which do not make up a valid date
statement error DataFusion error: Arrow error:
SELECT to_date('213111111');

# verify date cast with integer input
query DDDDDD
Expand Down
4 changes: 1 addition & 3 deletions docs/source/user-guide/sql/scalar_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -1735,9 +1735,7 @@ Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format]s are
Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`).
Returns the corresponding date.

Note: `to_date` returns Date32. The supported range for integer input is between `-96465293` and `95026237`.
Supported range for string input is between `1677-09-21` and `2262-04-11` exclusive. To parse dates outside of
that range use a [Chrono format].
Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`.

```
to_date(expression[, ..., format_n])
Expand Down

0 comments on commit 1a44ad2

Please sign in to comment.