From 3530904b2093de728e6d1d34d3b936e84e6f7ca8 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 21 May 2024 02:30:49 +0000 Subject: [PATCH 1/4] Fixes bug expect `Date32Array` but returns Int32Array --- .../physical_plan/parquet/statistics.rs | 9 +++++ .../core/tests/parquet/arrow_statistics.rs | 35 ++++++++++++------- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 0ebf7dfe2384..eeb1735dc1cb 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -75,6 +75,12 @@ macro_rules! get_statistic { *scale, )) } + Some(DataType::Date32) => { + Some(ScalarValue::Date32(Some(*s.$func()))) + } + Some(DataType::Date64) => { + Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000))) + } _ => Some(ScalarValue::Int32(Some(*s.$func()))), } } @@ -88,6 +94,9 @@ macro_rules! get_statistic { *scale, )) } + Some(DataType::Date64) => { + Some(ScalarValue::Date64(Some((*s.$func())))) + } _ => Some(ScalarValue::Int64(Some(*s.$func()))), } } diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs index 272afea7b28a..e611d73eb04f 100644 --- a/datafusion/core/tests/parquet/arrow_statistics.rs +++ b/datafusion/core/tests/parquet/arrow_statistics.rs @@ -21,9 +21,11 @@ use std::fs::File; use std::sync::Arc; +use arrow::compute::kernels::cast_utils::Parser; +use arrow::datatypes::{Date32Type, Date64Type}; use arrow_array::{ - make_array, Array, ArrayRef, Int16Array, Int32Array, Int64Array, Int8Array, - RecordBatch, UInt64Array, + make_array, Array, ArrayRef, Date32Array, Date64Array, Int16Array, Int32Array, + Int64Array, Int8Array, RecordBatch, UInt64Array, }; use arrow_schema::{DataType, Field, Schema}; use datafusion::datasource::physical_plan::parquet::{ @@ -577,8 +579,6 @@ async fn test_timestamp_diff_rg_sizes() { } // date with different row group sizes -// Bug expect `Date32Array` but returns Int32Array -// https://github.com/apache/datafusion/issues/10587 #[tokio::test] async fn test_dates_32_diff_rg_sizes() { let row_per_group = 13; @@ -593,10 +593,16 @@ async fn test_dates_32_diff_rg_sizes() { Test { reader, - // mins are [18262, 18565,] - expected_min: Arc::new(Int32Array::from(vec![18262, 18565])), - // maxes are [18564, 21865,] - expected_max: Arc::new(Int32Array::from(vec![18564, 21865])), + // mins are [2020-01-01, 2020-10-30] + expected_min: Arc::new(Date32Array::from(vec![ + Date32Type::parse("2020-01-01"), + Date32Type::parse("2020-10-30"), + ])), + // maxes are [2020-10-29, 2029-11-12] + expected_max: Arc::new(Date32Array::from(vec![ + Date32Type::parse("2020-10-29"), + Date32Type::parse("2029-11-12"), + ])), // nulls are [2, 2] expected_null_counts: UInt64Array::from(vec![2, 2]), // row counts are [13, 7] @@ -605,9 +611,6 @@ async fn test_dates_32_diff_rg_sizes() { .run("date32"); } -// BUG: same as above. Expect to return Date64Array but returns Int32Array -// test date with different row group sizes -// https://github.com/apache/datafusion/issues/10587 #[ignore] #[tokio::test] async fn test_dates_64_diff_rg_sizes() { @@ -616,8 +619,14 @@ async fn test_dates_64_diff_rg_sizes() { let reader = parquet_file_many_columns(Scenario::Dates, row_per_group).await; Test { reader, - expected_min: Arc::new(Int64Array::from(vec![18262, 18565])), // panic here because the actual data is Int32Array - expected_max: Arc::new(Int64Array::from(vec![18564, 21865])), + expected_min: Arc::new(Date64Array::from(vec![ + Date64Type::parse("2020-01-01"), + Date64Type::parse("2020-10-30"), + ])), + expected_max: Arc::new(Date64Array::from(vec![ + Date64Type::parse("2020-10-29"), + Date64Type::parse("2029-11-12"), + ])), expected_null_counts: UInt64Array::from(vec![2, 2]), expected_row_counts: UInt64Array::from(vec![13, 7]), } From b451329023d0257bf6dd2bbab6dc14627927d462 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 21 May 2024 03:07:15 +0000 Subject: [PATCH 2/4] Add round trip ut --- .../physical_plan/parquet/statistics.rs | 90 ++++++++++++++++++- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index eeb1735dc1cb..fd4094f0cf2e 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -372,10 +372,12 @@ impl<'a> StatisticsConverter<'a> { #[cfg(test)] mod test { use super::*; + use arrow::compute::kernels::cast_utils::Parser; + use arrow::datatypes::{Date32Type, Date64Type}; use arrow_array::{ - new_null_array, Array, BinaryArray, BooleanArray, Decimal128Array, Float32Array, - Float64Array, Int32Array, Int64Array, RecordBatch, StringArray, StructArray, - TimestampNanosecondArray, + new_null_array, Array, BinaryArray, BooleanArray, Date32Array, Date64Array, + Decimal128Array, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch, + StringArray, StructArray, TimestampNanosecondArray, }; use arrow_schema::{Field, SchemaRef}; use bytes::Bytes; @@ -673,6 +675,68 @@ mod test { .run() } + #[test] + fn roundtrip_date32() { + Test { + input: date32_array(vec![ + // row group 1 + Some("2021-01-01"), + None, + Some("2021-01-03"), + // row group 2 + Some("2021-01-01"), + Some("2021-01-05"), + None, + // row group 3 + None, + None, + None, + ]), + expected_min: date32_array(vec![ + Some("2021-01-01"), + Some("2021-01-01"), + None, + ]), + expected_max: date32_array(vec![ + Some("2021-01-03"), + Some("2021-01-05"), + None, + ]), + } + .run() + } + + #[test] + fn roundtrip_date64() { + Test { + input: date64_array(vec![ + // row group 1 + Some("2021-01-01"), + None, + Some("2021-01-03"), + // row group 2 + Some("2021-01-01"), + Some("2021-01-05"), + None, + // row group 3 + None, + None, + None, + ]), + expected_min: date64_array(vec![ + Some("2021-01-01"), + Some("2021-01-01"), + None, + ]), + expected_max: date64_array(vec![ + Some("2021-01-03"), + Some("2021-01-05"), + None, + ]), + } + .run() + } + #[test] fn struct_and_non_struct() { // Ensures that statistics for an array that appears *after* a struct @@ -1078,4 +1142,24 @@ mod test { ]); Arc::new(struct_array) } + + fn date32_array<'a>(input: impl IntoIterator>) -> ArrayRef { + let array = Date32Array::from( + input + .into_iter() + .map(|s| Date32Type::parse(s.unwrap_or_default())) + .collect::>(), + ); + Arc::new(array) + } + + fn date64_array<'a>(input: impl IntoIterator>) -> ArrayRef { + let array = Date64Array::from( + input + .into_iter() + .map(|s| Date64Type::parse(s.unwrap_or_default())) + .collect::>(), + ); + Arc::new(array) + } } From f13b5b772a455ce0fa05af4be1e193ab8c2f7013 Mon Sep 17 00:00:00 2001 From: Xin Li <33629085+xinlifoobar@users.noreply.github.com> Date: Tue, 21 May 2024 22:15:08 +0800 Subject: [PATCH 3/4] Update arrow_statistics.rs --- datafusion/core/tests/parquet/arrow_statistics.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs index e611d73eb04f..4ee722fcb822 100644 --- a/datafusion/core/tests/parquet/arrow_statistics.rs +++ b/datafusion/core/tests/parquet/arrow_statistics.rs @@ -611,7 +611,6 @@ async fn test_dates_32_diff_rg_sizes() { .run("date32"); } -#[ignore] #[tokio::test] async fn test_dates_64_diff_rg_sizes() { let row_per_group = 13; From 43cf4352440f56ee6cdcc0067ec886beded1bee6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 22 May 2024 15:26:28 -0400 Subject: [PATCH 4/4] remove unreachable code --- .../core/src/datasource/physical_plan/parquet/statistics.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index fd4094f0cf2e..d0ecb86f9e08 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -94,9 +94,6 @@ macro_rules! get_statistic { *scale, )) } - Some(DataType::Date64) => { - Some(ScalarValue::Date64(Some((*s.$func())))) - } _ => Some(ScalarValue::Int64(Some(*s.$func()))), } }