diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs index 8e5b3ad6c99..d91fef42229 100644 --- a/rust/arrow/src/csv/reader.rs +++ b/rust/arrow/src/csv/reader.rs @@ -65,6 +65,9 @@ lazy_static! { .case_insensitive(true) .build() .unwrap(); + static ref DATE_RE: Regex = Regex::new(r"^\d{4}-\d\d-\d\d$").unwrap(); + static ref DATETIME_RE: Regex = + Regex::new(r"^\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d$").unwrap(); } /// Infer the data type of a record @@ -81,6 +84,10 @@ fn infer_field_schema(string: &str) -> DataType { DataType::Float64 } else if INTEGER_RE.is_match(string) { DataType::Int64 + } else if DATETIME_RE.is_match(string) { + DataType::Date64(DateUnit::Millisecond) + } else if DATE_RE.is_match(string) { + DataType::Date32(DateUnit::Day) } else { DataType::Utf8 } @@ -436,6 +443,12 @@ fn parse( &DataType::Float64 => { build_primitive_array::(line_number, rows, i) } + &DataType::Date32(_) => { + build_primitive_array::(line_number, rows, i) + } + &DataType::Date64(_) => { + build_primitive_array::(line_number, rows, i) + } &DataType::Utf8 => { let mut builder = StringBuilder::new(rows.len()); for row in rows.iter() { @@ -496,6 +509,33 @@ impl Parser for Int16Type {} impl Parser for Int8Type {} +impl Parser for Date32Type { + fn parse(string: &str) -> Option { + let from_ymd = chrono::NaiveDate::from_ymd; + let since = chrono::NaiveDate::signed_duration_since; + + match Self::DATA_TYPE { + DataType::Date32(DateUnit::Day) => { + let days = string.parse::().ok()?; + Self::Native::from_i32(since(days, from_ymd(1970, 1, 1)).num_days() as i32) + } + _ => None, + } + } +} + +impl Parser for Date64Type { + fn parse(string: &str) -> Option { + match Self::DATA_TYPE { + DataType::Date64(DateUnit::Millisecond) => { + let date_time = string.parse::().ok()?; + Self::Native::from_i64(date_time.timestamp_millis()) + } + _ => None, + } + } +} + fn parse_item(string: &str) -> Option { T::parse(string) } @@ -929,13 +969,13 @@ mod tests { .has_header(true) .with_delimiter(b'|') .with_batch_size(512) - .with_projection(vec![0, 1, 2, 3]); + .with_projection(vec![0, 1, 2, 3, 4, 5]); let mut csv = builder.build(file).unwrap(); let batch = csv.next().unwrap().unwrap(); assert_eq!(5, batch.num_rows()); - assert_eq!(4, batch.num_columns()); + assert_eq!(6, batch.num_columns()); let schema = batch.schema(); @@ -943,11 +983,35 @@ mod tests { assert_eq!(&DataType::Float64, schema.field(1).data_type()); assert_eq!(&DataType::Float64, schema.field(2).data_type()); assert_eq!(&DataType::Boolean, schema.field(3).data_type()); + assert_eq!( + &DataType::Date32(DateUnit::Day), + schema.field(4).data_type() + ); + assert_eq!( + &DataType::Date64(DateUnit::Millisecond), + schema.field(5).data_type() + ); + + let names: Vec<&str> = + schema.fields().iter().map(|x| x.name().as_str()).collect(); + assert_eq!( + names, + vec![ + "c_int", + "c_float", + "c_string", + "c_bool", + "c_date", + "c_datetime" + ] + ); assert_eq!(false, schema.field(0).is_nullable()); assert_eq!(true, schema.field(1).is_nullable()); assert_eq!(true, schema.field(2).is_nullable()); assert_eq!(false, schema.field(3).is_nullable()); + assert_eq!(true, schema.field(4).is_nullable()); + assert_eq!(true, schema.field(5).is_nullable()); assert_eq!(false, batch.column(1).is_null(0)); assert_eq!(false, batch.column(1).is_null(1)); @@ -995,6 +1059,38 @@ mod tests { assert_eq!(infer_field_schema("10.2"), DataType::Float64); assert_eq!(infer_field_schema("true"), DataType::Boolean); assert_eq!(infer_field_schema("false"), DataType::Boolean); + assert_eq!( + infer_field_schema("2020-11-08"), + DataType::Date32(DateUnit::Day) + ); + assert_eq!( + infer_field_schema("2020-11-08T14:20:01"), + DataType::Date64(DateUnit::Millisecond) + ); + } + + #[test] + fn parse_date32() { + assert_eq!(parse_item::("1970-01-01").unwrap(), 0); + assert_eq!(parse_item::("2020-03-15").unwrap(), 18336); + assert_eq!(parse_item::("1945-05-08").unwrap(), -9004); + } + + #[test] + fn parse_date64() { + assert_eq!(parse_item::("1970-01-01T00:00:00").unwrap(), 0); + assert_eq!( + parse_item::("2018-11-13T17:11:10").unwrap(), + 1542129070000 + ); + assert_eq!( + parse_item::("2018-11-13T17:11:10.011").unwrap(), + 1542129070011 + ); + assert_eq!( + parse_item::("1900-02-28T12:34:56").unwrap(), + -2203932304000 + ); } #[test] diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs index 01e9f96dc66..9ee37d23b2e 100644 --- a/rust/arrow/src/datatypes.rs +++ b/rust/arrow/src/datatypes.rs @@ -209,6 +209,16 @@ pub trait ArrowNativeType: fn to_usize(&self) -> Option { None } + + /// Convert native type from i32. + fn from_i32(_: i32) -> Option { + None + } + + /// Convert native type from i64. + fn from_i64(_: i64) -> Option { + None + } } /// Trait indicating a primitive fixed-width type (bool, ints and floats). @@ -278,6 +288,11 @@ impl ArrowNativeType for i32 { fn to_usize(&self) -> Option { num::ToPrimitive::to_usize(self) } + + /// Convert native type from i32. + fn from_i32(val: i32) -> Option { + Some(val) + } } impl ArrowNativeType for i64 { @@ -292,6 +307,11 @@ impl ArrowNativeType for i64 { fn to_usize(&self) -> Option { num::ToPrimitive::to_usize(self) } + + /// Convert native type from i64. + fn from_i64(val: i64) -> Option { + Some(val) + } } impl ArrowNativeType for u8 { @@ -1333,18 +1353,16 @@ impl Field { )); } match data_type { - DataType::List(_) => DataType::List(Box::new( - Self::from(&values[0])?, - )), - DataType::LargeList(_) => DataType::LargeList(Box::new( - Self::from(&values[0])?, - )), - DataType::FixedSizeList(_, int) => { - DataType::FixedSizeList( - Box::new(Self::from(&values[0])?), - int, - ) + DataType::List(_) => { + DataType::List(Box::new(Self::from(&values[0])?)) + } + DataType::LargeList(_) => { + DataType::LargeList(Box::new(Self::from(&values[0])?)) } + DataType::FixedSizeList(_, int) => DataType::FixedSizeList( + Box::new(Self::from(&values[0])?), + int, + ), _ => unreachable!( "Data type should be a list, largelist or fixedsizelist" ), diff --git a/rust/arrow/test/data/various_types.csv b/rust/arrow/test/data/various_types.csv index 322d9c347aa..8f4466fbe6a 100644 --- a/rust/arrow/test/data/various_types.csv +++ b/rust/arrow/test/data/various_types.csv @@ -1,6 +1,6 @@ -c_int|c_float|c_string|c_bool -1|1.1|"1.11"|true -2|2.2|"2.22"|true -3||"3.33"|true -4|4.4||false -5|6.6|""|false \ No newline at end of file +c_int|c_float|c_string|c_bool|c_date|c_datetime +1|1.1|"1.11"|true|1970-01-01|1970-01-01T00:00:00 +2|2.2|"2.22"|true|2020-11-08|2020-11-08T01:00:00 +3||"3.33"|true|1969-12-31|1969-11-08T02:00:00 +4|4.4||false|| +5|6.6|""|false|1990-01-01|1990-01-01T03:00:00 \ No newline at end of file