Skip to content

Commit

Permalink
Minor: Add tests for extracting dictionary parquet statistics (#10729)
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb committed Jun 3, 2024
1 parent 6e5344a commit fbbab6c
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 3 deletions.
40 changes: 38 additions & 2 deletions datafusion/core/tests/parquet/arrow_statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1231,8 +1231,44 @@ async fn test_decimal() {
.run();
}

// BUG: statistics for dictionary columns are not converted from the
// physical type (BinaryArray) to the logical type (StringArray).
// https://github.com/apache/datafusion/issues/10605
#[tokio::test]
async fn test_dictionary() {
    // 5 rows per row group over the 7-row batch produced for
    // Scenario::Dictionary, so statistics cover two row groups (5 + 2 rows).
    let test_reader = TestReader {
        scenario: Scenario::Dictionary,
        row_per_group: 5,
    };

    // String dictionary keyed by Int8: per-row-group min/max of the
    // dictionary *values*, plus null and row counts.
    Test {
        reader: test_reader.build().await,
        expected_min: Arc::new(StringArray::from(vec!["abc", "aaa"])),
        expected_max: Arc::new(StringArray::from(vec!["def", "fffff"])),
        expected_null_counts: UInt64Array::from(vec![1, 0]),
        expected_row_counts: UInt64Array::from(vec![5, 2]),
        column_name: "string_dict_i8",
    }
    .run();

    // Same logical data keyed by Int32: the key width must not change
    // the extracted statistics.
    Test {
        reader: test_reader.build().await,
        expected_min: Arc::new(StringArray::from(vec!["abc", "aaa"])),
        expected_max: Arc::new(StringArray::from(vec!["def", "fffff"])),
        expected_null_counts: UInt64Array::from(vec![1, 0]),
        expected_row_counts: UInt64Array::from(vec![5, 2]),
        column_name: "string_dict_i32",
    }
    .run();

    // Int64 dictionary keyed by Int8.
    Test {
        reader: test_reader.build().await,
        expected_min: Arc::new(Int64Array::from(vec![-100, 0])),
        expected_max: Arc::new(Int64Array::from(vec![0, 100])),
        expected_null_counts: UInt64Array::from(vec![1, 0]),
        expected_row_counts: UInt64Array::from(vec![5, 2]),
        column_name: "int_dict_i8",
    }
    .run();
}

#[tokio::test]
async fn test_byte() {
// This creates a parquet file of 4 columns
Expand Down
44 changes: 43 additions & 1 deletion datafusion/core/tests/parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ use arrow::{
record_batch::RecordBatch,
util::pretty::pretty_format_batches,
};
use arrow_array::{make_array, BooleanArray, Float32Array, StructArray};
use arrow_array::types::{Int32Type, Int8Type};
use arrow_array::{make_array, BooleanArray, DictionaryArray, Float32Array, StructArray};
use chrono::{Datelike, Duration, TimeDelta};
use datafusion::{
datasource::{physical_plan::ParquetExec, provider_as_source, TableProvider},
Expand Down Expand Up @@ -81,7 +82,10 @@ enum Scenario {
DecimalBloomFilterInt64,
DecimalLargePrecision,
DecimalLargePrecisionBloomFilter,
/// StringArray, BinaryArray, FixedSizeBinaryArray
ByteArray,
/// DictionaryArray
Dictionary,
PeriodsInColumnNames,
WithNullValues,
WithNullValuesPageLevel,
Expand Down Expand Up @@ -783,6 +787,41 @@ fn make_numeric_limit_batch() -> RecordBatch {
.unwrap()
}

/// Builds a single `RecordBatch` with three dictionary-encoded columns:
/// two string dictionaries (keyed by `Int8` and `Int32`) holding the same
/// logical values, and an `Int64` value dictionary keyed by `Int8`.
/// Each column has 7 rows, exactly one of which is null.
fn make_dict_batch() -> RecordBatch {
    let string_values = [
        Some("abc"),
        Some("def"),
        None,
        Some("def"),
        Some("abc"),
        Some("fffff"),
        Some("aaa"),
    ];
    // Identical logical content, encoded with two different key widths.
    let strings_keyed_by_i8 =
        DictionaryArray::<Int8Type>::from_iter(string_values.iter().copied());
    let strings_keyed_by_i32 =
        DictionaryArray::<Int32Type>::from_iter(string_values.iter().copied());

    // Integer dictionary: each key indexes into [0, -100, 100]; the `None`
    // key produces the single null row.
    let int_dict_values = Int64Array::from(vec![0, -100, 100]);
    let int_dict_keys = Int8Array::from_iter([
        Some(0),
        Some(1),
        None,
        Some(0),
        Some(0),
        Some(2),
        Some(0),
    ]);
    let ints_keyed_by_i8 =
        DictionaryArray::<Int8Type>::try_new(int_dict_keys, Arc::new(int_dict_values))
            .unwrap();

    RecordBatch::try_from_iter(vec![
        ("string_dict_i8", Arc::new(strings_keyed_by_i8) as _),
        ("string_dict_i32", Arc::new(strings_keyed_by_i32) as _),
        ("int_dict_i8", Arc::new(ints_keyed_by_i8) as _),
    ])
    .unwrap()
}

fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
match scenario {
Scenario::Boolean => {
Expand Down Expand Up @@ -954,6 +993,9 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
),
]
}
Scenario::Dictionary => {
vec![make_dict_batch()]
}
Scenario::PeriodsInColumnNames => {
vec![
// all frontend
Expand Down

0 comments on commit fbbab6c

Please sign in to comment.