From 06cb704a434bfed321c5372ae5afdd4784ce59fa Mon Sep 17 00:00:00 2001 From: hanbings Date: Mon, 24 Feb 2025 16:08:53 +0800 Subject: [PATCH 1/7] feat: add format_timestamp function processes iso8601 format times --- crates/core/src/table/mod.rs | 57 +++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/crates/core/src/table/mod.rs b/crates/core/src/table/mod.rs index 93e4f8d1..799784ab 100644 --- a/crates/core/src/table/mod.rs +++ b/crates/core/src/table/mod.rs @@ -105,6 +105,7 @@ use crate::Result; use crate::config::read::HudiReadConfig; use arrow::record_batch::RecordBatch; use arrow_schema::{Field, Schema}; +use chrono::DateTime; use std::collections::{HashMap, HashSet}; use std::str::FromStr; use std::sync::Arc; @@ -445,8 +446,11 @@ impl Table { return Ok(Vec::new()); }; + let start_timestamp = format_timestamp(start_timestamp); + let end_timestamp = format_timestamp(end_timestamp); + let file_slices = self - .get_file_slices_between_internal(start_timestamp, end_timestamp) + .get_file_slices_between_internal(&start_timestamp, &end_timestamp) .await?; let fg_reader = self.create_file_group_reader_with_options([ @@ -474,6 +478,37 @@ impl Table { } } +/// Formats a timestamp string to `yyyyMMddHHmmSSSSS` format. +/// +/// The function attempts to parse the input timestamp string (`timestamp`) into a `DateTime` object. +/// It first tries parsing the ISO 8601 format, and if that fails, it tries two other formats: `yyyyMMddHHmmSSSSS` +/// and `yyyyMMddHHmmSS`. If none of the formats match, it returns the original input string. +/// +/// # Arguments +/// - `timestamp`: The input timestamp string to be formatted. +/// +/// # Returns +/// A string formatted as `yyyyMMddHHmmSSSSS`. If the input cannot be parsed, the original string is returned. +/// +fn format_timestamp(timestamp: &str) -> String { + if let Ok(datetime) = DateTime::parse_from_rfc3339(timestamp) { + return datetime.format("%Y%m%d%H%M%S%3f").to_string(); + } + + let formats = ["yyyyMMddHHmmSSSSS", "yyyyMMddHHmmSS"]; + for format in formats.iter() { + if let Ok(datetime) = DateTime::parse_from_str(timestamp, format) { + return datetime.format("%Y%m%d%H%M%S%3f").to_string(); + } + } + + if timestamp.len() == 10 && timestamp.chars().all(|c| c.is_digit(10) || c == '-') { + return format!("{}000000000", timestamp.replace("-", "")); + } + + timestamp.to_string() +} + #[cfg(test)] mod tests { use super::*; @@ -1357,4 +1392,24 @@ mod tests { Ok(()) } } + + #[test] + fn test_format_timestamp() { + let timestamps = vec![ + ("2019-01-23T12:34:56.123456789+00:00", "20190123123456123"), + ("2019-01-23T12:34:56.123456+00:00", "20190123123456123"), + ("2019-01-23T12:34:56.123+00:00", "20190123123456123"), + ("2019-01-23T12:34:56+00:00", "20190123123456000"), + ("2019-01-23T12:34:56Z", "20190123123456000"), + ("2019-01-23", "20190123000000000"), + ("20190123123456", "20190123123456"), + ("20190123000000000", "20190123000000000"), + ("20190123123456000", "20190123123456000"), + ("20190123123456123", "20190123123456123"), + ]; + + for (timestamp, result) in timestamps { + assert_eq!(crate::table::format_timestamp(timestamp), result) + } + } } From ed5ca09b1e0d8cdda05bf90c1baba5930724da24 Mon Sep 17 00:00:00 2001 From: hanbings Date: Mon, 24 Feb 2025 17:31:10 +0800 Subject: [PATCH 2/7] feat: override to the *as_of() method --- crates/core/src/table/mod.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/crates/core/src/table/mod.rs b/crates/core/src/table/mod.rs index 799784ab..e954dff1 100644 --- a/crates/core/src/table/mod.rs +++ b/crates/core/src/table/mod.rs @@ -241,8 +241,9 @@ impl Table { timestamp: &str, filters: &[(&str, &str, &str)], ) -> Result>> { + let timestamp = format_timestamp(timestamp); let filters = from_str_tuples(filters)?; - self.get_file_slices_splits_internal(n, timestamp, &filters) + self.get_file_slices_splits_internal(n, ×tamp, &filters) .await } @@ -295,8 +296,9 @@ impl Table { timestamp: &str, filters: &[(&str, &str, &str)], ) -> Result> { + let timestamp = format_timestamp(timestamp); let filters = from_str_tuples(filters)?; - self.get_file_slices_internal(timestamp, &filters).await + self.get_file_slices_internal(×tamp, &filters).await } async fn get_file_slices_internal( @@ -406,8 +408,9 @@ impl Table { timestamp: &str, filters: &[(&str, &str, &str)], ) -> Result> { + let timestamp = format_timestamp(timestamp); let filters = from_str_tuples(filters)?; - self.read_snapshot_internal(timestamp, &filters).await + self.read_snapshot_internal(×tamp, &filters).await } async fn read_snapshot_internal( @@ -489,7 +492,6 @@ impl Table { /// /// # Returns /// A string formatted as `yyyyMMddHHmmSSSSS`. If the input cannot be parsed, the original string is returned. -/// fn format_timestamp(timestamp: &str) -> String { if let Ok(datetime) = DateTime::parse_from_rfc3339(timestamp) { return datetime.format("%Y%m%d%H%M%S%3f").to_string(); From bf33059d9d0ea0b29e345f8f5f6921db9a4f785a Mon Sep 17 00:00:00 2001 From: hanbings Date: Mon, 24 Feb 2025 17:31:54 +0800 Subject: [PATCH 3/7] fix: run cargo fmt --- crates/core/src/table/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/core/src/table/mod.rs b/crates/core/src/table/mod.rs index e954dff1..a508bbf8 100644 --- a/crates/core/src/table/mod.rs +++ b/crates/core/src/table/mod.rs @@ -496,7 +496,7 @@ fn format_timestamp(timestamp: &str) -> String { if let Ok(datetime) = DateTime::parse_from_rfc3339(timestamp) { return datetime.format("%Y%m%d%H%M%S%3f").to_string(); } - + let formats = ["yyyyMMddHHmmSSSSS", "yyyyMMddHHmmSS"]; for format in formats.iter() { if let Ok(datetime) = DateTime::parse_from_str(timestamp, format) { @@ -507,7 +507,7 @@ fn format_timestamp(timestamp: &str) -> String { if timestamp.len() == 10 && timestamp.chars().all(|c| c.is_digit(10) || c == '-') { return format!("{}000000000", timestamp.replace("-", "")); } - + timestamp.to_string() } From 2dbd96f45a1c058ac284940ce6697cd6f5bd0157 Mon Sep 17 00:00:00 2001 From: hanbings Date: Sat, 19 Apr 2025 18:17:15 +0800 Subject: [PATCH 4/7] [feat] add support for local time. --- crates/core/src/table/mod.rs | 60 +++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/crates/core/src/table/mod.rs b/crates/core/src/table/mod.rs index 5b45f045..90c08f92 100644 --- a/crates/core/src/table/mod.rs +++ b/crates/core/src/table/mod.rs @@ -93,6 +93,7 @@ pub mod partition; use crate::config::table::HudiTableConfig::PartitionFields; use crate::config::table::{HudiTableConfig, TableTypeValue}; use crate::config::HudiConfigs; +use crate::error::CoreError; use crate::expr::filter::{from_str_tuples, Filter}; use crate::file_group::file_slice::FileSlice; use crate::file_group::reader::FileGroupReader; @@ -105,7 +106,7 @@ use crate::Result; use crate::config::read::HudiReadConfig; use arrow::record_batch::RecordBatch; use arrow_schema::{Field, Schema}; -use chrono::DateTime; +use chrono::{DateTime, Local, TimeZone, Utc}; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use url::Url; @@ -194,6 +195,10 @@ impl Table { .to::() } + pub fn is_utc_timezone(&self) -> bool { + self.timezone() == "UTC" + } + /// Get the latest Avro schema string of the table. pub async fn get_avro_schema(&self) -> Result { self.timeline.get_latest_avro_schema().await @@ -260,7 +265,7 @@ impl Table { timestamp: &str, filters: &[(&str, &str, &str)], ) -> Result>> { - let timestamp = format_timestamp(timestamp); + let timestamp = format_timestamp(timestamp, self.is_utc_timezone())?; let filters = from_str_tuples(filters)?; self.get_file_slices_splits_internal(n, ×tamp, &filters) .await @@ -315,7 +320,7 @@ impl Table { timestamp: &str, filters: &[(&str, &str, &str)], ) -> Result> { - let timestamp = format_timestamp(timestamp); + let timestamp = format_timestamp(timestamp, self.is_utc_timezone())?; let filters = from_str_tuples(filters)?; self.get_file_slices_internal(×tamp, &filters).await } @@ -428,7 +433,7 @@ impl Table { timestamp: &str, filters: &[(&str, &str, &str)], ) -> Result> { - let timestamp = format_timestamp(timestamp); + let timestamp = format_timestamp(timestamp, self.is_utc_timezone())?; let filters = from_str_tuples(filters)?; self.read_snapshot_internal(×tamp, &filters).await } @@ -469,8 +474,8 @@ impl Table { return Ok(Vec::new()); }; - let start_timestamp = format_timestamp(start_timestamp); - let end_timestamp = format_timestamp(end_timestamp); + let start_timestamp = format_timestamp(start_timestamp, self.is_utc_timezone())?; + let end_timestamp = format_timestamp(end_timestamp, self.is_utc_timezone())?; let file_slices = self .get_file_slices_between_internal(&start_timestamp, &end_timestamp) @@ -511,24 +516,46 @@ impl Table { /// - `timestamp`: The input timestamp string to be formatted. /// /// # Returns -/// A string formatted as `yyyyMMddHHmmSSSSS`. If the input cannot be parsed, the original string is returned. -fn format_timestamp(timestamp: &str) -> String { +/// A string formatted as `yyyyMMddHHmmSSSSS`. +fn format_timestamp(timestamp: &str, is_utc: bool) -> Result { + fn to_output(dt: DateTime, is_utc: bool) -> String { + match is_utc { + true => dt.with_timezone(&Utc).format("%Y%m%d%H%M%S%3f").to_string(), + false => dt + .with_timezone(&Local) + .format("%Y%m%d%H%M%S%3f") + .to_string(), + } + } + if let Ok(datetime) = DateTime::parse_from_rfc3339(timestamp) { - return datetime.format("%Y%m%d%H%M%S%3f").to_string(); + return Ok(to_output(datetime, is_utc)); } - let formats = ["yyyyMMddHHmmSSSSS", "yyyyMMddHHmmSS"]; + let formats = ["%Y%m%d%H%M%S%3f", "%Y%m%d%H%M%S"]; for format in formats.iter() { if let Ok(datetime) = DateTime::parse_from_str(timestamp, format) { - return datetime.format("%Y%m%d%H%M%S%3f").to_string(); + return Ok(to_output(datetime, is_utc)); } } - if timestamp.len() == 10 && timestamp.chars().all(|c| c.is_digit(10) || c == '-') { - return format!("{}000000000", timestamp.replace("-", "")); + // for a format like 2019-01-23. + if timestamp.len() == 10 && timestamp.chars().all(|c| c.is_ascii_digit() || c == '-') { + return Ok(format!("{}000000000", timestamp.replace("-", ""))); } - timestamp.to_string() + // for a format like 20190123000000 or 20190123000000000 + if timestamp.chars().all(|c| c.is_ascii_digit()) { + match timestamp.len() { + 14 | 17 => return Ok(timestamp.to_string()), + _ => {} + } + } + + Err(CoreError::Timeline(format!( + "Failed to parse timestamp: {}", + timestamp + ))) } #[cfg(test)] @@ -1478,7 +1505,10 @@ mod tests { ]; for (timestamp, result) in timestamps { - assert_eq!(crate::table::format_timestamp(timestamp), result) + assert_eq!( + crate::table::format_timestamp(timestamp, true).unwrap(), + result + ) } } } From 51b63dc0deda1b2f24ed32af0feb4fd86bdb4b7d Mon Sep 17 00:00:00 2001 From: Shiyan Xu <2701446+xushiyan@users.noreply.github.com> Date: Thu, 29 May 2025 02:29:28 -0500 Subject: [PATCH 5/7] improve parsing logic --- crates/core/src/error.rs | 3 + crates/core/src/table/mod.rs | 96 ++------------ crates/core/src/timeline/mod.rs | 1 + crates/core/src/timeline/util.rs | 208 +++++++++++++++++++++++++++++++ 4 files changed, 220 insertions(+), 88 deletions(-) create mode 100644 crates/core/src/timeline/util.rs diff --git a/crates/core/src/error.rs b/crates/core/src/error.rs index b5f19f23..9d4f25bc 100644 --- a/crates/core/src/error.rs +++ b/crates/core/src/error.rs @@ -72,6 +72,9 @@ pub enum CoreError { #[error("Timeline error: {0}")] Timeline(String), + #[error("{0}")] + TimestampParsingError(String), + #[error("{0}")] Unsupported(String), diff --git a/crates/core/src/table/mod.rs b/crates/core/src/table/mod.rs index 9082b238..f1619e98 100644 --- a/crates/core/src/table/mod.rs +++ b/crates/core/src/table/mod.rs @@ -92,23 +92,21 @@ mod fs_view; mod listing; pub mod partition; +use crate::config::read::HudiReadConfig; use crate::config::table::HudiTableConfig::PartitionFields; use crate::config::table::{HudiTableConfig, TableTypeValue}; use crate::config::HudiConfigs; -use crate::error::CoreError; use crate::expr::filter::{from_str_tuples, Filter}; use crate::file_group::file_slice::FileSlice; use crate::file_group::reader::FileGroupReader; use crate::table::builder::TableBuilder; use crate::table::fs_view::FileSystemView; use crate::table::partition::PartitionPruner; +use crate::timeline::util::format_timestamp; use crate::timeline::{Timeline, EARLIEST_START_TIMESTAMP}; use crate::Result; - -use crate::config::read::HudiReadConfig; use arrow::record_batch::RecordBatch; use arrow_schema::{Field, Schema}; -use chrono::{DateTime, Local, TimeZone, Utc}; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use url::Url; @@ -218,10 +216,6 @@ impl Table { .to::() } - pub fn is_utc_timezone(&self) -> bool { - self.timezone() == "UTC" - } - /// Get the latest Avro schema string of the table. pub async fn get_avro_schema(&self) -> Result { self.timeline.get_latest_avro_schema().await @@ -336,7 +330,7 @@ impl Table { I: IntoIterator, S: AsRef, { - let timestamp = format_timestamp(timestamp, self.is_utc_timezone())?; + let timestamp = format_timestamp(timestamp, &self.timezone())?; let filters = from_str_tuples(filters)?; self.get_file_slices_splits_internal(n, ×tamp, &filters) .await @@ -431,7 +425,7 @@ impl Table { I: IntoIterator, S: AsRef, { - let timestamp = format_timestamp(timestamp, self.is_utc_timezone())?; + let timestamp = format_timestamp(timestamp, &self.timezone())?; let filters = from_str_tuples(filters)?; self.get_file_slices_internal(×tamp, &filters).await } @@ -595,7 +589,7 @@ impl Table { I: IntoIterator, S: AsRef, { - let timestamp = format_timestamp(timestamp, self.is_utc_timezone())?; + let timestamp = format_timestamp(timestamp, &self.timezone())?; let filters = from_str_tuples(filters)?; self.read_snapshot_internal(×tamp, &filters).await } @@ -652,8 +646,9 @@ impl Table { return Ok(Vec::new()); }; - let start_timestamp = format_timestamp(start_timestamp, self.is_utc_timezone())?; - let end_timestamp = format_timestamp(end_timestamp, self.is_utc_timezone())?; + let timezone = self.timezone(); + let start_timestamp = format_timestamp(start_timestamp, &timezone)?; + let end_timestamp = format_timestamp(end_timestamp, &timezone)?; let file_slices = self .get_file_slices_between_internal(&start_timestamp, &end_timestamp) @@ -699,58 +694,6 @@ impl Table { } } -/// Formats a timestamp string to `yyyyMMddHHmmSSSSS` format. -/// -/// The function attempts to parse the input timestamp string (`timestamp`) into a `DateTime` object. -/// It first tries parsing the ISO 8601 format, and if that fails, it tries two other formats: `yyyyMMddHHmmSSSSS` -/// and `yyyyMMddHHmmSS`. If none of the formats match, it returns the original input string. -/// -/// # Arguments -/// - `timestamp`: The input timestamp string to be formatted. -/// -/// # Returns -/// A string formatted as `yyyyMMddHHmmSSSSS`. -fn format_timestamp(timestamp: &str, is_utc: bool) -> Result { - fn to_output(dt: DateTime, is_utc: bool) -> String { - match is_utc { - true => dt.with_timezone(&Utc).format("%Y%m%d%H%M%S%3f").to_string(), - false => dt - .with_timezone(&Local) - .format("%Y%m%d%H%M%S%3f") - .to_string(), - } - } - - if let Ok(datetime) = DateTime::parse_from_rfc3339(timestamp) { - return Ok(to_output(datetime, is_utc)); - } - - let formats = ["%Y%m%d%H%M%S%3f", "%Y%m%d%H%M%S"]; - for format in formats.iter() { - if let Ok(datetime) = DateTime::parse_from_str(timestamp, format) { - return Ok(to_output(datetime, is_utc)); - } - } - - // for a format like 2019-01-23. - if timestamp.len() == 10 && timestamp.chars().all(|c| c.is_ascii_digit() || c == '-') { - return Ok(format!("{}000000000", timestamp.replace("-", ""))); - } - - // for a format like 20190123000000 or 20190123000000000 - if timestamp.chars().all(|c| c.is_ascii_digit()) { - match timestamp.len() { - 14 | 17 => return Ok(timestamp.to_string()), - _ => {} - } - } - - Err(CoreError::Timeline(format!( - "Failed to parse timestamp: {}", - timestamp - ))) -} - #[cfg(test)] mod tests { use super::*; @@ -1663,27 +1606,4 @@ mod tests { Ok(()) } } - - #[test] - fn test_format_timestamp() { - let timestamps = vec![ - ("2019-01-23T12:34:56.123456789+00:00", "20190123123456123"), - ("2019-01-23T12:34:56.123456+00:00", "20190123123456123"), - ("2019-01-23T12:34:56.123+00:00", "20190123123456123"), - ("2019-01-23T12:34:56+00:00", "20190123123456000"), - ("2019-01-23T12:34:56Z", "20190123123456000"), - ("2019-01-23", "20190123000000000"), - ("20190123123456", "20190123123456"), - ("20190123000000000", "20190123000000000"), - ("20190123123456000", "20190123123456000"), - ("20190123123456123", "20190123123456123"), - ]; - - for (timestamp, result) in timestamps { - assert_eq!( - crate::table::format_timestamp(timestamp, true).unwrap(), - result - ) - } - } } diff --git a/crates/core/src/timeline/mod.rs b/crates/core/src/timeline/mod.rs index ac78903c..5a9bbdb6 100644 --- a/crates/core/src/timeline/mod.rs +++ b/crates/core/src/timeline/mod.rs @@ -18,6 +18,7 @@ */ pub mod instant; pub(crate) mod selector; +pub(crate) mod util; use crate::config::HudiConfigs; use crate::error::CoreError; diff --git a/crates/core/src/timeline/util.rs b/crates/core/src/timeline/util.rs new file mode 100644 index 00000000..28fcb5ef --- /dev/null +++ b/crates/core/src/timeline/util.rs @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use crate::config::table::TimelineTimezoneValue; +use crate::error::CoreError; +use crate::Result; +use chrono::{DateTime, Datelike, Local, NaiveDate, NaiveDateTime, TimeZone, Timelike, Utc}; +use std::str::FromStr; + +/// Parse various timestamp formats and convert to Hudi timeline format +/// +/// Supported formats: +/// - Hudi timeline format: `yyyyMMddHHmmSSSSS` or `yyyyMMddHHmmSS` +/// - Epoch time (seconds, milliseconds, microseconds, nanoseconds) +/// - ISO8601 formats with various precisions +/// - Date-only format: `YYYY-MM-DD` +/// +/// # Arguments +/// * `ts_str` - The timestamp str to parse +/// * `timezone` - The timezone config value to interpret the timestamp in (UTC or Local) +pub fn format_timestamp(ts_str: &str, timezone: &str) -> Result { + let ts_str = ts_str.trim(); + + if let Ok(formatted_ts) = parse_hudi_timeline_format(ts_str) { + return Ok(formatted_ts); + } + + let timezone = TimelineTimezoneValue::from_str(timezone)?; + + if let Ok(formatted_ts) = parse_epoch_time(ts_str, &timezone) { + return Ok(formatted_ts); + } + + if let Ok(formatted_ts) = parse_iso8601_format(ts_str, &timezone) { + return Ok(formatted_ts); + } + + if let Ok(formatted_ts) = parse_date_only_format(ts_str, &timezone) { + return Ok(formatted_ts); + } + + Err(CoreError::TimestampParsingError(format!( + "Unable to parse timestamp: {}", + ts_str + ))) +} + +/// Parse existing Hudi timeline format and ensure it's properly formatted +fn parse_hudi_timeline_format(ts_str: &str) -> Result { + if ts_str.len() == 14 && ts_str.chars().all(|c| c.is_ascii_digit()) { + return Ok(format!("{}000", ts_str)); + } else if ts_str.len() == 17 && ts_str.chars().all(|c| c.is_ascii_digit()) { + return Ok(ts_str.to_string()); + } + + Err(CoreError::TimestampParsingError( + "Not a Hudi timeline format".to_string(), + )) +} + +/// Parse epoch time in various precisions +fn parse_epoch_time(ts_str: &str, timezone: &TimelineTimezoneValue) -> Result { + let ts: i64 = ts_str + .parse() + .map_err(|_| CoreError::TimestampParsingError(format!("Invalid epoch time: {}", ts_str)))?; + + let datetime = if ts_str.len() <= 10 { + // Seconds + DateTime::from_timestamp(ts, 0) + } else if ts_str.len() <= 13 { + // Milliseconds + DateTime::from_timestamp(ts / 1000, ((ts % 1000) * 1_000_000) as u32) + } else if ts_str.len() <= 16 { + // Microseconds + DateTime::from_timestamp(ts / 1_000_000, ((ts % 1_000_000) * 1000) as u32) + } else { + // Nanoseconds + DateTime::from_timestamp(ts / 1_000_000_000, (ts % 1_000_000_000) as u32) + }; + + match datetime { + Some(dt) => match timezone { + TimelineTimezoneValue::UTC => Ok(datetime_to_timeline_format(&dt)), + TimelineTimezoneValue::Local => { + let local_dt = dt.with_timezone(&Local); + Ok(datetime_to_timeline_format(&local_dt)) + } + }, + None => Err(CoreError::TimestampParsingError(format!( + "Invalid epoch time: {}", + ts_str + ))), + } +} + +/// Parse ISO8601 format timestamps +fn parse_iso8601_format(ts_str: &str, timezone: &TimelineTimezoneValue) -> Result { + // List of ISO8601 formats to try, ordered from the most specific to the least specific + let formats = [ + // With timezone offset + "%Y-%m-%dT%H:%M:%S%.9f%:z", // nanoseconds with timezone + "%Y-%m-%dT%H:%M:%S%.6f%:z", // microseconds with timezone + "%Y-%m-%dT%H:%M:%S%.3f%:z", // milliseconds with timezone + "%Y-%m-%dT%H:%M:%S%:z", // seconds with timezone + // With Z (UTC) + "%Y-%m-%dT%H:%M:%S%.9fZ", // nanoseconds with Z + "%Y-%m-%dT%H:%M:%S%.6fZ", // microseconds with Z + "%Y-%m-%dT%H:%M:%S%.3fZ", // milliseconds with Z + "%Y-%m-%dT%H:%M:%SZ", // seconds with Z + // Without timezone (ambiguous - use mode to determine interpretation) + "%Y-%m-%dT%H:%M:%S%.9f", // nanoseconds + "%Y-%m-%dT%H:%M:%S%.6f", // microseconds + "%Y-%m-%dT%H:%M:%S%.3f", // milliseconds + "%Y-%m-%dT%H:%M:%S", // seconds + ]; + + // Try parsing with timezone-aware formats first (these ignore the timezone config) + for format in &formats[..8] { + if let Ok(dt) = DateTime::parse_from_str(ts_str, format) { + return match timezone { + TimelineTimezoneValue::UTC => { + Ok(datetime_to_timeline_format(&dt.with_timezone(&Utc))) + } + TimelineTimezoneValue::Local => { + let local_dt = dt.with_timezone(&Local); + Ok(datetime_to_timeline_format(&local_dt)) + } + }; + } + } + + // Try parsing as naive datetime and interpret it according to the timezone config + for format in &formats[8..] { + if let Ok(naive_dt) = NaiveDateTime::parse_from_str(ts_str, format) { + return format_naive_date_time(naive_dt, timezone); + } + } + + Err(CoreError::TimestampParsingError( + "Not a valid ISO8601 format".to_string(), + )) +} + +/// Parse date-only format (YYYY-MM-DD) +fn parse_date_only_format(ts_str: &str, timezone: &TimelineTimezoneValue) -> Result { + if let Ok(date) = NaiveDate::parse_from_str(ts_str, "%Y-%m-%d") { + // Convert to datetime at start of day + let datetime = date + .and_hms_opt(0, 0, 0) + .ok_or_else(|| CoreError::TimestampParsingError("Invalid date".to_string()))?; + + return format_naive_date_time(datetime, timezone); + } + + Err(CoreError::TimestampParsingError( + "Not a valid date format".to_string(), + )) +} + +fn format_naive_date_time( + date_time: NaiveDateTime, + timezone: &TimelineTimezoneValue, +) -> Result { + match timezone { + TimelineTimezoneValue::UTC => { + let dt = Utc.from_utc_datetime(&date_time); + Ok(datetime_to_timeline_format(&dt)) + } + TimelineTimezoneValue::Local => { + let dt = Local + .from_local_datetime(&date_time) + .single() + .ok_or_else(|| { + CoreError::TimestampParsingError("Ambiguous local datetime".to_string()) + })?; + Ok(datetime_to_timeline_format(&dt)) + } + } +} + +fn datetime_to_timeline_format(dt: &DateTime) -> String { + let year = dt.year(); + let month = dt.month(); + let day = dt.day(); + let hour = dt.hour(); + let minute = dt.minute(); + let second = dt.second(); + let timestamp_subsec_millis = dt.timestamp_subsec_millis(); + format!( + "{:04}{:02}{:02}{:02}{:02}{:02}{:03}", + year, month, day, hour, minute, second, timestamp_subsec_millis + ) +} From ab23658d3b6a8b3c7d32ef2ccbf2bc0d378c5581 Mon Sep 17 00:00:00 2001 From: Shiyan Xu <2701446+xushiyan@users.noreply.github.com> Date: Thu, 29 May 2025 02:41:17 -0500 Subject: [PATCH 6/7] minor update --- crates/core/src/timeline/util.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/core/src/timeline/util.rs b/crates/core/src/timeline/util.rs index 28fcb5ef..fb96d378 100644 --- a/crates/core/src/timeline/util.rs +++ b/crates/core/src/timeline/util.rs @@ -36,7 +36,7 @@ use std::str::FromStr; pub fn format_timestamp(ts_str: &str, timezone: &str) -> Result { let ts_str = ts_str.trim(); - if let Ok(formatted_ts) = parse_hudi_timeline_format(ts_str) { + if let Ok(formatted_ts) = parse_timeline_format(ts_str) { return Ok(formatted_ts); } @@ -60,11 +60,9 @@ pub fn format_timestamp(ts_str: &str, timezone: &str) -> Result { ))) } -/// Parse existing Hudi timeline format and ensure it's properly formatted -fn parse_hudi_timeline_format(ts_str: &str) -> Result { - if ts_str.len() == 14 && ts_str.chars().all(|c| c.is_ascii_digit()) { - return Ok(format!("{}000", ts_str)); - } else if ts_str.len() == 17 && ts_str.chars().all(|c| c.is_ascii_digit()) { +/// Parse the timestamp for timeline format +fn parse_timeline_format(ts_str: &str) -> Result { + if matches!(ts_str.len(), 14 | 17) && ts_str.chars().all(|c| c.is_ascii_digit()) { return Ok(ts_str.to_string()); } From 9de56442792c8698c21866e047f853d29936b640 Mon Sep 17 00:00:00 2001 From: Shiyan Xu <2701446+xushiyan@users.noreply.github.com> Date: Thu, 29 May 2025 14:37:25 -0500 Subject: [PATCH 7/7] add ut --- crates/core/src/timeline/util.rs | 466 ++++++++++++++++++++++++------- crates/test/src/util.rs | 12 +- 2 files changed, 377 insertions(+), 101 deletions(-) diff --git a/crates/core/src/timeline/util.rs b/crates/core/src/timeline/util.rs index fb96d378..2dea69e1 100644 --- a/crates/core/src/timeline/util.rs +++ b/crates/core/src/timeline/util.rs @@ -19,16 +19,16 @@ use crate::config::table::TimelineTimezoneValue; use crate::error::CoreError; use crate::Result; -use chrono::{DateTime, Datelike, Local, NaiveDate, NaiveDateTime, TimeZone, Timelike, Utc}; +use chrono::{DateTime, Datelike, Local, TimeZone, Timelike, Utc}; use std::str::FromStr; +use CoreError::TimestampParsingError; /// Parse various timestamp formats and convert to Hudi timeline format /// /// Supported formats: /// - Hudi timeline format: `yyyyMMddHHmmSSSSS` or `yyyyMMddHHmmSS` /// - Epoch time (seconds, milliseconds, microseconds, nanoseconds) -/// - ISO8601 formats with various precisions -/// - Date-only format: `YYYY-MM-DD` +/// - RFC3339 format like `2024-03-15T14:25:30Z` or `2024-03-15T14:25:30+00:00` or `2024-03-15` /// /// # Arguments /// * `ts_str` - The timestamp str to parse @@ -36,8 +36,11 @@ use std::str::FromStr; pub fn format_timestamp(ts_str: &str, timezone: &str) -> Result { let ts_str = ts_str.trim(); - if let Ok(formatted_ts) = parse_timeline_format(ts_str) { - return Ok(formatted_ts); + let mut parse_errors: Vec = vec![]; + + match parse_timeline_format(ts_str) { + Ok(formatted_ts) => return Ok(formatted_ts), + Err(e) => parse_errors.push(e), } let timezone = TimelineTimezoneValue::from_str(timezone)?; @@ -45,37 +48,42 @@ pub fn format_timestamp(ts_str: &str, timezone: &str) -> Result { if let Ok(formatted_ts) = parse_epoch_time(ts_str, &timezone) { return Ok(formatted_ts); } - - if let Ok(formatted_ts) = parse_iso8601_format(ts_str, &timezone) { - return Ok(formatted_ts); + match parse_epoch_time(ts_str, &timezone) { + Ok(formatted_ts) => return Ok(formatted_ts), + Err(e) => parse_errors.push(e), } - if let Ok(formatted_ts) = parse_date_only_format(ts_str, &timezone) { - return Ok(formatted_ts); + match parse_rfc3339_format(ts_str, &timezone) { + Ok(formatted_ts) => return Ok(formatted_ts), + Err(e) => parse_errors.push(e), } - Err(CoreError::TimestampParsingError(format!( - "Unable to parse timestamp: {}", - ts_str + Err(TimestampParsingError(format!( + "Unable to parse timestamp: {} due to errors: {:?}", + ts_str, parse_errors ))) } -/// Parse the timestamp for timeline format fn parse_timeline_format(ts_str: &str) -> Result { if matches!(ts_str.len(), 14 | 17) && ts_str.chars().all(|c| c.is_ascii_digit()) { return Ok(ts_str.to_string()); } - Err(CoreError::TimestampParsingError( + Err(TimestampParsingError( "Not a Hudi timeline format".to_string(), )) } -/// Parse epoch time in various precisions fn parse_epoch_time(ts_str: &str, timezone: &TimelineTimezoneValue) -> Result { - let ts: i64 = ts_str - .parse() - .map_err(|_| CoreError::TimestampParsingError(format!("Invalid epoch time: {}", ts_str)))?; + let ts: i64 = ts_str.parse().map_err(|e| { + TimestampParsingError(format!("Invalid epoch time: {} due to {:?}", ts_str, e)) + })?; + if ts < 0 { + return Err(TimestampParsingError(format!( + "Epoch time must be non-negative: {}", + ts_str + ))); + } let datetime = if ts_str.len() <= 10 { // Seconds @@ -99,98 +107,29 @@ fn parse_epoch_time(ts_str: &str, timezone: &TimelineTimezoneValue) -> Result Err(CoreError::TimestampParsingError(format!( + None => Err(TimestampParsingError(format!( "Invalid epoch time: {}", ts_str ))), } } -/// Parse ISO8601 format timestamps -fn parse_iso8601_format(ts_str: &str, timezone: &TimelineTimezoneValue) -> Result { - // List of ISO8601 formats to try, ordered from the most specific to the least specific - let formats = [ - // With timezone offset - "%Y-%m-%dT%H:%M:%S%.9f%:z", // nanoseconds with timezone - "%Y-%m-%dT%H:%M:%S%.6f%:z", // microseconds with timezone - "%Y-%m-%dT%H:%M:%S%.3f%:z", // milliseconds with timezone - "%Y-%m-%dT%H:%M:%S%:z", // seconds with timezone - // With Z (UTC) - "%Y-%m-%dT%H:%M:%S%.9fZ", // nanoseconds with Z - "%Y-%m-%dT%H:%M:%S%.6fZ", // microseconds with Z - "%Y-%m-%dT%H:%M:%S%.3fZ", // milliseconds with Z - "%Y-%m-%dT%H:%M:%SZ", // seconds with Z - // Without timezone (ambiguous - use mode to determine interpretation) - "%Y-%m-%dT%H:%M:%S%.9f", // nanoseconds - "%Y-%m-%dT%H:%M:%S%.6f", // microseconds - "%Y-%m-%dT%H:%M:%S%.3f", // milliseconds - "%Y-%m-%dT%H:%M:%S", // seconds - ]; - - // Try parsing with timezone-aware formats first (these ignore the timezone config) - for format in &formats[..8] { - if let Ok(dt) = DateTime::parse_from_str(ts_str, format) { - return match timezone { - TimelineTimezoneValue::UTC => { - Ok(datetime_to_timeline_format(&dt.with_timezone(&Utc))) - } - TimelineTimezoneValue::Local => { - let local_dt = dt.with_timezone(&Local); - Ok(datetime_to_timeline_format(&local_dt)) - } - }; - } - } - - // Try parsing as naive datetime and interpret it according to the timezone config - for format in &formats[8..] { - if let Ok(naive_dt) = NaiveDateTime::parse_from_str(ts_str, format) { - return format_naive_date_time(naive_dt, timezone); - } - } - - Err(CoreError::TimestampParsingError( - "Not a valid ISO8601 format".to_string(), - )) -} - -/// Parse date-only format (YYYY-MM-DD) -fn parse_date_only_format(ts_str: &str, timezone: &TimelineTimezoneValue) -> Result { - if let Ok(date) = NaiveDate::parse_from_str(ts_str, "%Y-%m-%d") { - // Convert to datetime at start of day - let datetime = date - .and_hms_opt(0, 0, 0) - .ok_or_else(|| CoreError::TimestampParsingError("Invalid date".to_string()))?; - - return format_naive_date_time(datetime, timezone); +fn parse_rfc3339_format(ts_str: &str, timezone: &TimelineTimezoneValue) -> Result { + if let Ok(dt) = DateTime::parse_from_rfc3339(ts_str) { + return match timezone { + TimelineTimezoneValue::UTC => Ok(datetime_to_timeline_format(&dt.with_timezone(&Utc))), + TimelineTimezoneValue::Local => { + let local_dt = dt.with_timezone(&Local); + Ok(datetime_to_timeline_format(&local_dt)) + } + }; } - Err(CoreError::TimestampParsingError( - "Not a valid date format".to_string(), + Err(TimestampParsingError( + "Not a valid RFC3339 format".to_string(), )) } -fn format_naive_date_time( - date_time: NaiveDateTime, - timezone: &TimelineTimezoneValue, -) -> Result { - match timezone { - TimelineTimezoneValue::UTC => { - let dt = Utc.from_utc_datetime(&date_time); - Ok(datetime_to_timeline_format(&dt)) - } - TimelineTimezoneValue::Local => { - let dt = Local - .from_local_datetime(&date_time) - .single() - .ok_or_else(|| { - CoreError::TimestampParsingError("Ambiguous local datetime".to_string()) - })?; - Ok(datetime_to_timeline_format(&dt)) - } - } -} - fn datetime_to_timeline_format(dt: &DateTime) -> String { let year = dt.year(); let month = dt.month(); @@ -204,3 +143,330 @@ fn datetime_to_timeline_format(dt: &DateTime) -> String { year, month, day, hour, minute, second, timestamp_subsec_millis ) } + +#[cfg(test)] +mod tests { + use super::*; + use chrono::{TimeZone, Utc}; + use hudi_test::util::{reset_timezone, set_fixed_timezone}; + + fn set_singapore_timezone() { + set_fixed_timezone("Asia/Singapore"); + } + + #[test] + fn test_parse_timeline_format() { + // Valid 14-digit format + let result = parse_timeline_format("20240315142530").unwrap(); + assert_eq!(result, "20240315142530"); + + // Valid 17-digit format + let result = parse_timeline_format("20240315142530123").unwrap(); + assert_eq!(result, "20240315142530123"); + + // Invalid formats + assert!(parse_timeline_format("2024031514253").is_err()); // Wrong length + assert!(parse_timeline_format("202403151425301234").is_err()); // Wrong length + assert!(parse_timeline_format("2024031514253a").is_err()); // Non-digits + assert!(parse_timeline_format("").is_err()); // Empty + } + + #[test] + fn test_parse_epoch_time() { + // Test epoch time in seconds (10 digits or fewer) + let result = parse_epoch_time("1710512730", &TimelineTimezoneValue::UTC).unwrap(); + // March 15, 2024 14:25:30 UTC + assert_eq!(result, "20240315142530000"); + + // Test with Local timezone (Singapore UTC+8) + set_singapore_timezone(); + let result = parse_epoch_time("1710512730", &TimelineTimezoneValue::Local).unwrap(); + // March 15, 2024 14:25:30 UTC = March 15, 2024 22:25:30 Singapore time + assert_eq!(result, "20240315222530000"); + reset_timezone(); + + // Test milliseconds + let result = parse_epoch_time("1710512730123", &TimelineTimezoneValue::UTC).unwrap(); + assert_eq!(result, "20240315142530123"); + + // Test microseconds + let result = parse_epoch_time("1710512730123456", &TimelineTimezoneValue::UTC).unwrap(); + assert_eq!(result, "20240315142530123"); + + // Test nanoseconds + let result = parse_epoch_time("1710512730123456789", &TimelineTimezoneValue::UTC).unwrap(); + assert_eq!(result, "20240315142530123"); + + // Invalid epoch time + assert!(parse_epoch_time("not_a_number", &TimelineTimezoneValue::UTC).is_err()); + assert!(parse_epoch_time("-1", &TimelineTimezoneValue::UTC).is_err()); + } + + #[test] + fn test_parse_rfc3339_format() { + // RFC3339 with timezone offset + let result = + parse_rfc3339_format("2024-03-15T14:25:30+00:00", &TimelineTimezoneValue::UTC).unwrap(); + assert_eq!(result, "20240315142530000"); + + // RFC3339 with Z (UTC) + let result = + parse_rfc3339_format("2024-03-15T14:25:30Z", &TimelineTimezoneValue::UTC).unwrap(); + assert_eq!(result, "20240315142530000"); + + // RFC3339 with milliseconds + let result = + parse_rfc3339_format("2024-03-15T14:25:30.123Z", &TimelineTimezoneValue::UTC).unwrap(); + assert_eq!(result, "20240315142530123"); + + // Test conversion to local timezone (Singapore) + set_singapore_timezone(); + let result = + parse_rfc3339_format("2024-03-15T14:25:30Z", &TimelineTimezoneValue::Local).unwrap(); + assert_eq!(result, "20240315222530000"); // Convert UTC to Singapore time + + // RFC3339 with different timezone offset + let result = + parse_rfc3339_format("2024-03-15T20:25:30+06:00", &TimelineTimezoneValue::Local) + .unwrap(); + assert_eq!(result, "20240315222530000"); // +06:00 = UTC+6, so 20:25:30+06:00 = 14:25:30Z = 22:25:30 Singapore + reset_timezone(); + + // Invalid RFC3339 format + assert!(parse_rfc3339_format("2024-03-15 14:25:30", &TimelineTimezoneValue::UTC).is_err()); + assert!(parse_rfc3339_format("2024/03/15T14:25:30Z", &TimelineTimezoneValue::UTC).is_err()); + assert!(parse_rfc3339_format("not-a-date", &TimelineTimezoneValue::UTC).is_err()); + } + + #[test] + fn test_datetime_to_timeline_format() { + // Test with UTC timezone + let dt = Utc + .with_ymd_and_hms(2024, 3, 15, 14, 25, 30) + .unwrap() + .with_nanosecond(123_000_000) + .unwrap(); + let result = datetime_to_timeline_format(&dt); + assert_eq!(result, "20240315142530123"); + + // Test with zero milliseconds + let dt = Utc.with_ymd_and_hms(2024, 3, 15, 14, 25, 30).unwrap(); + let result = datetime_to_timeline_format(&dt); + assert_eq!(result, "20240315142530000"); + + // Test with single-digit values + let dt = Utc.with_ymd_and_hms(2024, 1, 5, 9, 8, 7).unwrap(); + let result = datetime_to_timeline_format(&dt); + assert_eq!(result, "20240105090807000"); + } + + #[test] + fn test_format_timestamp_timeline_format() { + // Already in timeline format should be returned as-is + let result = format_timestamp("20240315142530", "UTC").unwrap(); + assert_eq!(result, "20240315142530"); + + let result = format_timestamp("20240315142530123", "UTC").unwrap(); + assert_eq!(result, "20240315142530123"); + + // Should work with Local timezone too (no conversion needed) + let result = format_timestamp("20240315142530", "Local").unwrap(); + assert_eq!(result, "20240315142530"); + } + + #[test] + fn test_format_timestamp_epoch_time() { + set_singapore_timezone(); + + // Epoch time with UTC + let result = format_timestamp("1710512730", "UTC").unwrap(); + assert_eq!(result, "20240315142530000"); + + // Epoch time with Local (Singapore) + let result = format_timestamp("1710512730", "Local").unwrap(); + assert_eq!(result, "20240315222530000"); + + // Epoch time with milliseconds + let result = format_timestamp("1710512730123", "UTC").unwrap(); + assert_eq!(result, "20240315142530123"); + + let result = format_timestamp("1710512730123", "Local").unwrap(); + assert_eq!(result, "20240315222530123"); + + reset_timezone(); + } + + #[test] + fn test_format_timestamp_rfc3339() { + set_singapore_timezone(); + + // RFC3339 with timezone info + let result = format_timestamp("2024-03-15T14:25:30Z", "UTC").unwrap(); + assert_eq!(result, "20240315142530000"); + + let result = format_timestamp("2024-03-15T14:25:30Z", "Local").unwrap(); + assert_eq!(result, "20240315222530000"); // Convert UTC to Singapore + + // RFC3339 with offset + let result = format_timestamp("2024-03-15T14:25:30+00:00", "UTC").unwrap(); + assert_eq!(result, "20240315142530000"); + + let result = format_timestamp("2024-03-15T14:25:30+00:00", "Local").unwrap(); + assert_eq!(result, "20240315222530000"); + + // RFC3339 with milliseconds + let result = format_timestamp("2024-03-15T14:25:30.123Z", "UTC").unwrap(); + assert_eq!(result, "20240315142530123"); + + let result = format_timestamp("2024-03-15T14:25:30.123Z", "Local").unwrap(); + assert_eq!(result, "20240315222530123"); + + reset_timezone(); + } + + #[test] + fn test_format_timestamp_comprehensive() { + set_singapore_timezone(); + + // Test all supported formats + let test_cases = vec![ + // Timeline format (no conversion) + ("20240315142530", "UTC", "20240315142530"), + ("20240315142530", "Local", "20240315142530"), + ("20240315142530123", "UTC", "20240315142530123"), + ("20240315142530123", "Local", "20240315142530123"), + // Epoch times + ("1710512730", "UTC", "20240315142530000"), // 14:25:30 UTC + ("1710512730", "Local", "20240315222530000"), // 22:25:30 Singapore + ("1710512730123", "UTC", "20240315142530123"), + ("1710512730123", "Local", "20240315222530123"), + // RFC3339 with timezone + ("2024-03-15T14:25:30Z", "UTC", "20240315142530000"), + ("2024-03-15T14:25:30Z", "Local", "20240315222530000"), + ("2024-03-15T14:25:30+00:00", "UTC", "20240315142530000"), + ("2024-03-15T14:25:30+00:00", "Local", "20240315222530000"), + ("2024-03-15T14:25:30.123Z", "UTC", "20240315142530123"), + ("2024-03-15T14:25:30.123Z", "Local", "20240315222530123"), + ]; + + for (input, timezone, expected) in test_cases { + let result = format_timestamp(input, timezone).unwrap(); + assert_eq!( + result, expected, + "Failed for input: {} with timezone: {}", + input, timezone + ); + } + + reset_timezone(); + } + + #[test] + fn test_format_timestamp_parsing_order() { + // Test that the timeline format is checked first (should not be parsed as epoch) + let result = format_timestamp("20240315142530", "UTC").unwrap(); + assert_eq!( + result, "20240315142530", + "Should be treated as the timeline format, not epoch" + ); + + // Test that epoch is checked before RFC3339 + let result = format_timestamp("1710512730", "UTC").unwrap(); + assert_eq!( + result, "20240315142530000", + "Should be treated as epoch time" + ); + } + + #[test] + fn test_format_timestamp_error_collection() { + // Test that invalid input collects all parsing errors + let result = format_timestamp("invalid-input", "UTC"); + assert!(result.is_err()); + + let error_msg = format!("{:?}", result.unwrap_err()); + assert!(error_msg.contains("Unable to parse timestamp: invalid-input")); + assert!(error_msg.contains("due to errors")); + } + + #[test] + fn test_format_timestamp_invalid_inputs() { + let invalid_inputs = vec![ + "invalid-timestamp", + "2024-13-45T14:25:30Z", // Invalid date in RFC3339 + "", + "2024/03/15", // Wrong format + "abcdefghijklmn", // Wrong length for timeline + ]; + + for input in invalid_inputs { + let result = format_timestamp(input, "UTC"); + assert!(result.is_err(), "Should fail for input: {}", input); + } + } + + #[test] + fn test_format_timestamp_invalid_timezone() { + let result = format_timestamp("2024-03-15T14:25:30Z", "InvalidTimezone"); + assert!(result.is_err()); + } + + #[test] + fn test_whitespace_handling() { + // Test that whitespace is properly trimmed + let result = format_timestamp(" 20240315142530 ", "UTC").unwrap(); + assert_eq!(result, "20240315142530"); + + let result = format_timestamp(" 2024-03-15T14:25:30Z ", "UTC").unwrap(); + assert_eq!(result, "20240315142530000"); + + let result = format_timestamp(" 1710512730 ", "UTC").unwrap(); + assert_eq!(result, "20240315142530000"); + } + + #[test] + fn test_timezone_conversion_consistency() { + set_singapore_timezone(); + + // Test that the same timestamp produces consistent results with different input formats + let timestamp_epoch = "1710512730"; // March 15, 2024 14:25:30 UTC + let timestamp_rfc3339 = "2024-03-15T14:25:30Z"; + + let epoch_utc = format_timestamp(timestamp_epoch, "UTC").unwrap(); + let epoch_local = format_timestamp(timestamp_epoch, "Local").unwrap(); + let rfc3339_utc = format_timestamp(timestamp_rfc3339, "UTC").unwrap(); + let rfc3339_local = format_timestamp(timestamp_rfc3339, "Local").unwrap(); + + // All UTC results should be the same + assert_eq!(epoch_utc, rfc3339_utc); + assert_eq!(epoch_utc, "20240315142530000"); + + // All Local results should be the same + assert_eq!(epoch_local, rfc3339_local); + assert_eq!(epoch_local, "20240315222530000"); // Singapore time + + reset_timezone(); + } + + #[test] + fn test_edge_cases() { + set_singapore_timezone(); + + // Test leap year date + let result = format_timestamp("2024-02-29T12:00:00Z", "UTC").unwrap(); + assert_eq!(result, "20240229120000000"); + + let result = format_timestamp("2024-02-29T12:00:00Z", "Local").unwrap(); + assert_eq!(result, "20240229200000000"); // Singapore time + + // Test end of year + let result = format_timestamp("2024-12-31T23:59:59Z", "UTC").unwrap(); + assert_eq!(result, "20241231235959000"); + + // Test beginning of year + let result = format_timestamp("2024-01-01T00:00:00Z", "UTC").unwrap(); + assert_eq!(result, "20240101000000000"); + + reset_timezone(); + } +} diff --git a/crates/test/src/util.rs b/crates/test/src/util.rs index a4fb5dcf..87c4889b 100644 --- a/crates/test/src/util.rs +++ b/crates/test/src/util.rs @@ -16,9 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - use arrow::record_batch::RecordBatch; use arrow_array::{Array, BooleanArray, Int32Array, StringArray}; +use std::env; pub fn get_str_column<'a>(record_batch: &'a RecordBatch, name: &str) -> Vec<&'a str> { record_batch @@ -55,3 +55,13 @@ pub fn get_bool_column(record_batch: &RecordBatch, name: &str) -> Vec { .map(|s| s.unwrap()) .collect::>() } + +/// Sets a fixed timezone by setting the TZ environment variable. +pub fn set_fixed_timezone(tz: &str) { + env::set_var("TZ", tz); +} + +/// Resets the timezone to the system default by removing the TZ environment variable. +pub fn reset_timezone() { + env::remove_var("TZ"); +}