Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 98 additions & 2 deletions rust/arrow/src/csv/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ lazy_static! {
.case_insensitive(true)
.build()
.unwrap();
static ref DATE_RE: Regex = Regex::new(r"^\d{4}-\d\d-\d\d$").unwrap();
static ref DATETIME_RE: Regex =
Regex::new(r"^\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d$").unwrap();
}

/// Infer the data type of a record
Expand All @@ -81,6 +84,10 @@ fn infer_field_schema(string: &str) -> DataType {
DataType::Float64
} else if INTEGER_RE.is_match(string) {
DataType::Int64
} else if DATETIME_RE.is_match(string) {
DataType::Date64(DateUnit::Millisecond)
} else if DATE_RE.is_match(string) {
DataType::Date32(DateUnit::Day)
} else {
DataType::Utf8
}
Expand Down Expand Up @@ -436,6 +443,12 @@ fn parse(
&DataType::Float64 => {
build_primitive_array::<Float64Type>(line_number, rows, i)
}
&DataType::Date32(_) => {
build_primitive_array::<Date32Type>(line_number, rows, i)
}
&DataType::Date64(_) => {
build_primitive_array::<Date64Type>(line_number, rows, i)
}
&DataType::Utf8 => {
let mut builder = StringBuilder::new(rows.len());
for row in rows.iter() {
Expand Down Expand Up @@ -496,6 +509,33 @@ impl Parser for Int16Type {}

impl Parser for Int8Type {}

impl Parser for Date32Type {
fn parse(string: &str) -> Option<i32> {
let from_ymd = chrono::NaiveDate::from_ymd;
let since = chrono::NaiveDate::signed_duration_since;

match Self::DATA_TYPE {
DataType::Date32(DateUnit::Day) => {
let days = string.parse::<chrono::NaiveDate>().ok()?;
Self::Native::from_i32(since(days, from_ymd(1970, 1, 1)).num_days() as i32)
}
_ => None,
}
}
}

impl Parser for Date64Type {
fn parse(string: &str) -> Option<i64> {
match Self::DATA_TYPE {
DataType::Date64(DateUnit::Millisecond) => {
let date_time = string.parse::<chrono::NaiveDateTime>().ok()?;
Self::Native::from_i64(date_time.timestamp_millis())
}
_ => None,
}
}
}

fn parse_item<T: Parser>(string: &str) -> Option<T::Native> {
T::parse(string)
}
Expand Down Expand Up @@ -929,25 +969,49 @@ mod tests {
.has_header(true)
.with_delimiter(b'|')
.with_batch_size(512)
.with_projection(vec![0, 1, 2, 3]);
.with_projection(vec![0, 1, 2, 3, 4, 5]);

let mut csv = builder.build(file).unwrap();
let batch = csv.next().unwrap().unwrap();

assert_eq!(5, batch.num_rows());
assert_eq!(4, batch.num_columns());
assert_eq!(6, batch.num_columns());

let schema = batch.schema();

assert_eq!(&DataType::Int64, schema.field(0).data_type());
assert_eq!(&DataType::Float64, schema.field(1).data_type());
assert_eq!(&DataType::Float64, schema.field(2).data_type());
assert_eq!(&DataType::Boolean, schema.field(3).data_type());
assert_eq!(
&DataType::Date32(DateUnit::Day),
schema.field(4).data_type()
);
assert_eq!(
&DataType::Date64(DateUnit::Millisecond),
schema.field(5).data_type()
);

let names: Vec<&str> =
schema.fields().iter().map(|x| x.name().as_str()).collect();
assert_eq!(
names,
vec![
"c_int",
"c_float",
"c_string",
"c_bool",
"c_date",
"c_datetime"
]
);

assert_eq!(false, schema.field(0).is_nullable());
assert_eq!(true, schema.field(1).is_nullable());
assert_eq!(true, schema.field(2).is_nullable());
assert_eq!(false, schema.field(3).is_nullable());
assert_eq!(true, schema.field(4).is_nullable());
assert_eq!(true, schema.field(5).is_nullable());

assert_eq!(false, batch.column(1).is_null(0));
assert_eq!(false, batch.column(1).is_null(1));
Expand Down Expand Up @@ -995,6 +1059,38 @@ mod tests {
assert_eq!(infer_field_schema("10.2"), DataType::Float64);
assert_eq!(infer_field_schema("true"), DataType::Boolean);
assert_eq!(infer_field_schema("false"), DataType::Boolean);
assert_eq!(
infer_field_schema("2020-11-08"),
DataType::Date32(DateUnit::Day)
);
assert_eq!(
infer_field_schema("2020-11-08T14:20:01"),
DataType::Date64(DateUnit::Millisecond)
);
}

#[test]
fn parse_date32() {
assert_eq!(parse_item::<Date32Type>("1970-01-01").unwrap(), 0);
assert_eq!(parse_item::<Date32Type>("2020-03-15").unwrap(), 18336);
assert_eq!(parse_item::<Date32Type>("1945-05-08").unwrap(), -9004);
}

#[test]
fn parse_date64() {
assert_eq!(parse_item::<Date64Type>("1970-01-01T00:00:00").unwrap(), 0);
assert_eq!(
parse_item::<Date64Type>("2018-11-13T17:11:10").unwrap(),
1542129070000
);
assert_eq!(
parse_item::<Date64Type>("2018-11-13T17:11:10.011").unwrap(),
1542129070011
);
assert_eq!(
parse_item::<Date64Type>("1900-02-28T12:34:56").unwrap(),
-2203932304000
);
}

#[test]
Expand Down
40 changes: 29 additions & 11 deletions rust/arrow/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,16 @@ pub trait ArrowNativeType:
fn to_usize(&self) -> Option<usize> {
None
}

/// Convert native type from i32.
fn from_i32(_: i32) -> Option<Self> {
None
}

/// Convert native type from i64.
fn from_i64(_: i64) -> Option<Self> {
None
}
}

/// Trait indicating a primitive fixed-width type (bool, ints and floats).
Expand Down Expand Up @@ -278,6 +288,11 @@ impl ArrowNativeType for i32 {
fn to_usize(&self) -> Option<usize> {
num::ToPrimitive::to_usize(self)
}

/// Convert native type from i32.
fn from_i32(val: i32) -> Option<Self> {
Some(val)
}
}

impl ArrowNativeType for i64 {
Expand All @@ -292,6 +307,11 @@ impl ArrowNativeType for i64 {
fn to_usize(&self) -> Option<usize> {
num::ToPrimitive::to_usize(self)
}

/// Convert native type from i64.
fn from_i64(val: i64) -> Option<Self> {
Some(val)
}
}

impl ArrowNativeType for u8 {
Expand Down Expand Up @@ -1333,18 +1353,16 @@ impl Field {
));
}
match data_type {
DataType::List(_) => DataType::List(Box::new(
Self::from(&values[0])?,
)),
DataType::LargeList(_) => DataType::LargeList(Box::new(
Self::from(&values[0])?,
)),
DataType::FixedSizeList(_, int) => {
DataType::FixedSizeList(
Box::new(Self::from(&values[0])?),
int,
)
DataType::List(_) => {
DataType::List(Box::new(Self::from(&values[0])?))
}
DataType::LargeList(_) => {
DataType::LargeList(Box::new(Self::from(&values[0])?))
}
DataType::FixedSizeList(_, int) => DataType::FixedSizeList(
Box::new(Self::from(&values[0])?),
int,
),
_ => unreachable!(
"Data type should be a list, largelist or fixedsizelist"
),
Expand Down
12 changes: 6 additions & 6 deletions rust/arrow/test/data/various_types.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
c_int|c_float|c_string|c_bool
1|1.1|"1.11"|true
2|2.2|"2.22"|true
3||"3.33"|true
4|4.4||false
5|6.6|""|false
c_int|c_float|c_string|c_bool|c_date|c_datetime
1|1.1|"1.11"|true|1970-01-01|1970-01-01T00:00:00
2|2.2|"2.22"|true|2020-11-08|2020-11-08T01:00:00
3||"3.33"|true|1969-12-31|1969-11-08T02:00:00
4|4.4||false||
5|6.6|""|false|1990-01-01|1990-01-01T03:00:00