diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 4bb4fb3e79b6..f0e7de056ea2 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -4229,6 +4229,48 @@ mod tests { } } + #[test] + fn test_cast_string_with_large_date_to_date32() { + let array = Arc::new(StringArray::from(vec![ + Some("+10999-12-31"), + Some("-0010-02-28"), + Some("0010-02-28"), + Some("0000-01-01"), + Some("-0000-01-01"), + Some("-0001-01-01"), + ])) as ArrayRef; + let to_type = DataType::Date32; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + let b = cast_with_options(&array, &to_type, &options).unwrap(); + let c = b.as_primitive::(); + assert_eq!(3298139, c.value(0)); // +10999-12-31 + assert_eq!(-723122, c.value(1)); // -0010-02-28 + assert_eq!(-715817, c.value(2)); // 0010-02-28 + assert_eq!(c.value(3), c.value(4)); // "0000-01-01" and "-0000-01-01" must parse to the same value + assert_eq!(-719528, c.value(3)); // 0000-01-01 + assert_eq!(-719528, c.value(4)); // -0000-01-01 + assert_eq!(-719893, c.value(5)); // -0001-01-01 + } + + #[test] + fn test_cast_invalid_string_with_large_date_to_date32() { + // A year with more than four digits must carry an explicit + or - sign; the unsigned form below is rejected with a cast error + let array = Arc::new(StringArray::from(vec![Some("10999-12-31")])) as ArrayRef; + let to_type = DataType::Date32; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + let err = cast_with_options(&array, &to_type, &options).unwrap_err(); + assert_eq!( + err.to_string(), + "Cast error: Cannot cast string '10999-12-31' to value of Date32 type" + ); + } + #[test] fn test_cast_string_format_yyyymmdd_to_date32() { let a0 = Arc::new(StringViewArray::from(vec![ diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 4e93e9787cc8..55834ad92a01 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -595,6 +595,32 @@ const EPOCH_DAYS_FROM_CE: i32 = 719_163; 
const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; fn parse_date(string: &str) -> Option { + // Handle dates with an extended (signed) year such as "+10999-12-31" or "-0012-05-06" + // + // Following the ISO 8601 convention as documented for Java's [`ISO_LOCAL_DATE`], years have: + // Four digits or more for the year. Years in the range 0000 to 9999 will be pre-padded by + // zero to ensure four digits. Years outside that range will have a prefixed positive or negative symbol. + // + // [`ISO_LOCAL_DATE`]: https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/time/format/DateTimeFormatter.html#ISO_LOCAL_DATE + if string.starts_with('+') || string.starts_with('-') { + // Skip the sign and look for the hyphen that terminates the year digits. + // According to ISO 8601 the unsigned part must be at least 4 digits. + let rest = &string[1..]; + let hyphen = rest.find('-')?; + if hyphen < 4 { + return None; + } + // The year substring is the sign and the digits (but not the separator) + // e.g. for "+10999-12-31", hyphen is 5 and string[..6] is "+10999" + let year: i32 = string[..hyphen + 1].parse().ok()?; + // The remainder should begin with a '-' which we strip off, leaving the month-day part. + let remainder = string[hyphen + 1..].strip_prefix('-')?; + let mut parts = remainder.splitn(2, '-'); + let month: u32 = parts.next()?.parse().ok()?; + let day: u32 = parts.next()?.parse().ok()?; + return NaiveDate::from_ymd_opt(year, month, day); + } + if string.len() > 10 { // Try to parse as datetime and return just the date part return string_to_datetime(&Utc, string)