Remove test duplication in parquet statistics tests #6185

Closed · 5 tasks

alamb opened this issue Aug 2, 2024 · 1 comment · Fixed by #6190
Labels: enhancement (Any new improvement worthy of an entry in the changelog) · parquet (Changes to the parquet crate)

Comments

alamb (Contributor) commented Aug 2, 2024

Is your feature request related to a problem or challenge? Please describe what you are trying to do.

When adding additional support to the statistics converter, as @Kev1n8 did in #6181, the presence of two sets of tests makes it confusing where tests for new functionality should be added.

There are tests in both parquet/src/arrow/arrow_reader/statistics.rs and parquet/tests/arrow_reader/statistics.rs.

Describe the solution you'd like
I would like to remove the redundant tests.

Describe alternatives you've considered
I think the clearest alternative is to move all the tests from https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_reader/statistics.rs to https://github.com/apache/arrow-rs/blob/master/parquet/tests/arrow_reader/statistics.rs.

We should move only the non-duplicated tests.

So that means something like:

  • Leave a comment in https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_reader/statistics.rs with a note about where the tests now live
  • Keep edge condition tests (the shared `Test` harness they use is sketched after this list):
    #[test]
    fn roundtrip_empty() {
        let empty_bool_array = new_empty_array(&DataType::Boolean);
        Test {
            input: empty_bool_array.clone(),
            expected_min: empty_bool_array.clone(),
            expected_max: empty_bool_array.clone(),
        }
        .run()
    }
  • Remove tests that already have corresponding coverage in parquet/tests/arrow_reader/statistics.rs, such as:
    #[test]
    fn roundtrip_int32() {
        Test {
            input: i32_array([
                // row group 1
                Some(1),
                None,
                Some(3),
                // row group 2
                Some(0),
                Some(5),
                None,
                // row group 3
                None,
                None,
                None,
            ]),
            expected_min: i32_array([Some(1), Some(0), None]),
            expected_max: i32_array([Some(3), Some(5), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_int64() {
        Test {
            input: i64_array([
                // row group 1
                Some(1),
                None,
                Some(3),
                // row group 2
                Some(0),
                Some(5),
                None,
                // row group 3
                None,
                None,
                None,
            ]),
            expected_min: i64_array([Some(1), Some(0), None]),
            expected_max: i64_array([Some(3), Some(5), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_f32() {
        Test {
            input: f32_array([
                // row group 1
                Some(1.0),
                None,
                Some(3.0),
                // row group 2
                Some(-1.0),
                Some(5.0),
                None,
                // row group 3
                None,
                None,
                None,
            ]),
            expected_min: f32_array([Some(1.0), Some(-1.0), None]),
            expected_max: f32_array([Some(3.0), Some(5.0), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_f64() {
        Test {
            input: f64_array([
                // row group 1
                Some(1.0),
                None,
                Some(3.0),
                // row group 2
                Some(-1.0),
                Some(5.0),
                None,
                // row group 3
                None,
                None,
                None,
            ]),
            expected_min: f64_array([Some(1.0), Some(-1.0), None]),
            expected_max: f64_array([Some(3.0), Some(5.0), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_timestamp() {
        Test {
            input: timestamp_seconds_array(
                [
                    // row group 1
                    Some(1),
                    None,
                    Some(3),
                    // row group 2
                    Some(9),
                    Some(5),
                    None,
                    // row group 3
                    None,
                    None,
                    None,
                ],
                None,
            ),
            expected_min: timestamp_seconds_array([Some(1), Some(5), None], None),
            expected_max: timestamp_seconds_array([Some(3), Some(9), None], None),
        }
        .run();

        Test {
            input: timestamp_milliseconds_array(
                [
                    // row group 1
                    Some(1),
                    None,
                    Some(3),
                    // row group 2
                    Some(9),
                    Some(5),
                    None,
                    // row group 3
                    None,
                    None,
                    None,
                ],
                None,
            ),
            expected_min: timestamp_milliseconds_array([Some(1), Some(5), None], None),
            expected_max: timestamp_milliseconds_array([Some(3), Some(9), None], None),
        }
        .run();

        Test {
            input: timestamp_microseconds_array(
                [
                    // row group 1
                    Some(1),
                    None,
                    Some(3),
                    // row group 2
                    Some(9),
                    Some(5),
                    None,
                    // row group 3
                    None,
                    None,
                    None,
                ],
                None,
            ),
            expected_min: timestamp_microseconds_array([Some(1), Some(5), None], None),
            expected_max: timestamp_microseconds_array([Some(3), Some(9), None], None),
        }
        .run();

        Test {
            input: timestamp_nanoseconds_array(
                [
                    // row group 1
                    Some(1),
                    None,
                    Some(3),
                    // row group 2
                    Some(9),
                    Some(5),
                    None,
                    // row group 3
                    None,
                    None,
                    None,
                ],
                None,
            ),
            expected_min: timestamp_nanoseconds_array([Some(1), Some(5), None], None),
            expected_max: timestamp_nanoseconds_array([Some(3), Some(9), None], None),
        }
        .run()
    }

    #[test]
    fn roundtrip_timestamp_timezoned() {
        Test {
            input: timestamp_seconds_array(
                [
                    // row group 1
                    Some(1),
                    None,
                    Some(3),
                    // row group 2
                    Some(9),
                    Some(5),
                    None,
                    // row group 3
                    None,
                    None,
                    None,
                ],
                Some("UTC"),
            ),
            expected_min: timestamp_seconds_array([Some(1), Some(5), None], Some("UTC")),
            expected_max: timestamp_seconds_array([Some(3), Some(9), None], Some("UTC")),
        }
        .run();

        Test {
            input: timestamp_milliseconds_array(
                [
                    // row group 1
                    Some(1),
                    None,
                    Some(3),
                    // row group 2
                    Some(9),
                    Some(5),
                    None,
                    // row group 3
                    None,
                    None,
                    None,
                ],
                Some("UTC"),
            ),
            expected_min: timestamp_milliseconds_array([Some(1), Some(5), None], Some("UTC")),
            expected_max: timestamp_milliseconds_array([Some(3), Some(9), None], Some("UTC")),
        }
        .run();

        Test {
            input: timestamp_microseconds_array(
                [
                    // row group 1
                    Some(1),
                    None,
                    Some(3),
                    // row group 2
                    Some(9),
                    Some(5),
                    None,
                    // row group 3
                    None,
                    None,
                    None,
                ],
                Some("UTC"),
            ),
            expected_min: timestamp_microseconds_array([Some(1), Some(5), None], Some("UTC")),
            expected_max: timestamp_microseconds_array([Some(3), Some(9), None], Some("UTC")),
        }
        .run();

        Test {
            input: timestamp_nanoseconds_array(
                [
                    // row group 1
                    Some(1),
                    None,
                    Some(3),
                    // row group 2
                    Some(9),
                    Some(5),
                    None,
                    // row group 3
                    None,
                    None,
                    None,
                ],
                Some("UTC"),
            ),
            expected_min: timestamp_nanoseconds_array([Some(1), Some(5), None], Some("UTC")),
            expected_max: timestamp_nanoseconds_array([Some(3), Some(9), None], Some("UTC")),
        }
        .run()
    }

    #[test]
    fn roundtrip_decimal() {
        Test {
            input: Arc::new(
                Decimal128Array::from(vec![
                    // row group 1
                    Some(100),
                    None,
                    Some(22000),
                    // row group 2
                    Some(500000),
                    Some(330000),
                    None,
                    // row group 3
                    None,
                    None,
                    None,
                ])
                .with_precision_and_scale(9, 2)
                .unwrap(),
            ),
            expected_min: Arc::new(
                Decimal128Array::from(vec![Some(100), Some(330000), None])
                    .with_precision_and_scale(9, 2)
                    .unwrap(),
            ),
            expected_max: Arc::new(
                Decimal128Array::from(vec![Some(22000), Some(500000), None])
                    .with_precision_and_scale(9, 2)
                    .unwrap(),
            ),
        }
        .run();

        Test {
            input: Arc::new(
                Decimal256Array::from(vec![
                    // row group 1
                    Some(i256::from(100)),
                    None,
                    Some(i256::from(22000)),
                    // row group 2
                    Some(i256::MAX),
                    Some(i256::MIN),
                    None,
                    // row group 3
                    None,
                    None,
                    None,
                ])
                .with_precision_and_scale(76, 76)
                .unwrap(),
            ),
            expected_min: Arc::new(
                Decimal256Array::from(vec![Some(i256::from(100)), Some(i256::MIN), None])
                    .with_precision_and_scale(76, 76)
                    .unwrap(),
            ),
            expected_max: Arc::new(
                Decimal256Array::from(vec![Some(i256::from(22000)), Some(i256::MAX), None])
                    .with_precision_and_scale(76, 76)
                    .unwrap(),
            ),
        }
        .run()
    }

    #[test]
    fn roundtrip_utf8() {
        Test {
            input: utf8_array([
                // row group 1
                Some("A"),
                None,
                Some("Q"),
                // row group 2
                Some("ZZ"),
                Some("AA"),
                None,
                // row group 3
                None,
                None,
                None,
            ]),
            expected_min: utf8_array([Some("A"), Some("AA"), None]),
            expected_max: utf8_array([Some("Q"), Some("ZZ"), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_struct() {
        let mut test = Test {
            input: struct_array(vec![
                // row group 1
                (Some(true), Some(1)),
                (None, None),
                (Some(true), Some(3)),
                // row group 2
                (Some(true), Some(0)),
                (Some(false), Some(5)),
                (None, None),
                // row group 3
                (None, None),
                (None, None),
                (None, None),
            ]),
            expected_min: struct_array(vec![
                (Some(true), Some(1)),
                (Some(true), Some(0)),
                (None, None),
            ]),
            expected_max: struct_array(vec![
                (Some(true), Some(3)),
                (Some(true), Some(0)),
                (None, None),
            ]),
        };
        // Due to https://github.com/apache/datafusion/issues/8334,
        // statistics for struct arrays are not supported
        test.expected_min = new_null_array(test.input.data_type(), test.expected_min.len());
        test.expected_max = new_null_array(test.input.data_type(), test.expected_min.len());
        test.run()
    }

    #[test]
    fn roundtrip_binary() {
        Test {
            input: Arc::new(BinaryArray::from_opt_vec(vec![
                // row group 1
                Some(b"A"),
                None,
                Some(b"Q"),
                // row group 2
                Some(b"ZZ"),
                Some(b"AA"),
                None,
                // row group 3
                None,
                None,
                None,
            ])),
            expected_min: Arc::new(BinaryArray::from_opt_vec(vec![
                Some(b"A"),
                Some(b"AA"),
                None,
            ])),
            expected_max: Arc::new(BinaryArray::from_opt_vec(vec![
                Some(b"Q"),
                Some(b"ZZ"),
                None,
            ])),
        }
        .run()
    }

    #[test]
    fn roundtrip_date32() {
        Test {
            input: date32_array(vec![
                // row group 1
                Some("2021-01-01"),
                None,
                Some("2021-01-03"),
                // row group 2
                Some("2021-01-01"),
                Some("2021-01-05"),
                None,
                // row group 3
                None,
                None,
                None,
            ]),
            expected_min: date32_array(vec![Some("2021-01-01"), Some("2021-01-01"), None]),
            expected_max: date32_array(vec![Some("2021-01-03"), Some("2021-01-05"), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_date64() {
        Test {
            input: date64_array(vec![
                // row group 1
                Some("2021-01-01"),
                None,
                Some("2021-01-03"),
                // row group 2
                Some("2021-01-01"),
                Some("2021-01-05"),
                None,
                // row group 3
                None,
                None,
                None,
            ]),
            expected_min: date64_array(vec![Some("2021-01-01"), Some("2021-01-01"), None]),
            expected_max: date64_array(vec![Some("2021-01-03"), Some("2021-01-05"), None]),
        }
        .run()
    }

    #[test]
    fn roundtrip_large_binary_array() {
        let input: Vec<Option<&[u8]>> = vec![
            // row group 1
            Some(b"A"),
            None,
            Some(b"Q"),
            // row group 2
            Some(b"ZZ"),
            Some(b"AA"),
            None,
            // row group 3
            None,
            None,
            None,
        ];
        let expected_min: Vec<Option<&[u8]>> = vec![Some(b"A"), Some(b"AA"), None];
        let expected_max: Vec<Option<&[u8]>> = vec![Some(b"Q"), Some(b"ZZ"), None];
        Test {
            input: large_binary_array(input),
            expected_min: large_binary_array(expected_min),
            expected_max: large_binary_array(expected_max),
        }
        .run();
    }

    #[test]
    fn struct_and_non_struct() {
        // Ensures that statistics for an array that appears *after* a struct
        // array are not wrong
        let struct_col = struct_array(vec![
            // row group 1
            (Some(true), Some(1)),
            (None, None),
            (Some(true), Some(3)),
        ]);
        let int_col = i32_array([Some(100), Some(200), Some(300)]);
        let expected_min = i32_array([Some(100)]);
        let expected_max = i32_array([Some(300)]);

        // use a name that shadows a name in the struct column
        match struct_col.data_type() {
            DataType::Struct(fields) => {
                assert_eq!(fields.get(1).unwrap().name(), "int_col")
            }
            _ => panic!("unexpected data type for struct column"),
        };

        let input_batch =
            RecordBatch::try_from_iter([("struct_col", struct_col), ("int_col", int_col)]).unwrap();
        let schema = input_batch.schema();
        let metadata = parquet_metadata(schema.clone(), input_batch);
        let parquet_schema = metadata.file_metadata().schema_descr();

        // read the int_col statistics
        let (idx, _) = parquet_column(parquet_schema, &schema, "int_col").unwrap();
        assert_eq!(idx, 2);

        let row_groups = metadata.row_groups();
        let converter = StatisticsConverter::try_new("int_col", &schema, parquet_schema).unwrap();

        let min = converter.row_group_mins(row_groups.iter()).unwrap();
        assert_eq!(
            &min,
            &expected_min,
            "Min. Statistics\n\n{}\n\n",
            DisplayStats(row_groups)
        );

        let max = converter.row_group_maxes(row_groups.iter()).unwrap();
        assert_eq!(
            &max,
            &expected_max,
            "Max. Statistics\n\n{}\n\n",
            DisplayStats(row_groups)
        );
    }
  • Keep data file validation tests (see the `TestFile` sketch after this list):
    #[test]
    fn nan_in_stats() {
        // /parquet-testing/data/nan_in_stats.parquet
        // row_groups: 1
        // "x": Double({min: Some(1.0), max: Some(NaN), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        TestFile::new("nan_in_stats.parquet")
            .with_column(ExpectedColumn {
                name: "x",
                expected_min: Arc::new(Float64Array::from(vec![Some(1.0)])),
                expected_max: Arc::new(Float64Array::from(vec![Some(f64::NAN)])),
            })
            .run();
    }

    #[test]
    fn alltypes_plain() {
        // /parquet-testing/data/datapage_v1-snappy-compressed-checksum.parquet
        // row_groups: 1
        // (has no statistics)
        TestFile::new("alltypes_plain.parquet")
            // No column statistics should be read as NULL, but with the right type
            .with_column(ExpectedColumn {
                name: "id",
                expected_min: i32_array([None]),
                expected_max: i32_array([None]),
            })
            .with_column(ExpectedColumn {
                name: "bool_col",
                expected_min: bool_array([None]),
                expected_max: bool_array([None]),
            })
            .run();
    }

    #[test]
    fn alltypes_tiny_pages() {
        // /parquet-testing/data/alltypes_tiny_pages.parquet
        // row_groups: 1
        // "id": Int32({min: Some(0), max: Some(7299), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "bool_col": Boolean({min: Some(false), max: Some(true), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "tinyint_col": Int32({min: Some(0), max: Some(9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "smallint_col": Int32({min: Some(0), max: Some(9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "int_col": Int32({min: Some(0), max: Some(9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "bigint_col": Int64({min: Some(0), max: Some(90), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "float_col": Float({min: Some(0.0), max: Some(9.9), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "double_col": Double({min: Some(0.0), max: Some(90.89999999999999), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "date_string_col": ByteArray({min: Some(ByteArray { data: "01/01/09" }), max: Some(ByteArray { data: "12/31/10" }), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "string_col": ByteArray({min: Some(ByteArray { data: "0" }), max: Some(ByteArray { data: "9" }), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "timestamp_col": Int96({min: None, max: None, distinct_count: None, null_count: 0, min_max_deprecated: true, min_max_backwards_compatible: true})
        // "year": Int32({min: Some(2009), max: Some(2010), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        // "month": Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 0, min_max_deprecated: false, min_max_backwards_compatible: false})
        TestFile::new("alltypes_tiny_pages.parquet")
            .with_column(ExpectedColumn {
                name: "id",
                expected_min: i32_array([Some(0)]),
                expected_max: i32_array([Some(7299)]),
            })
            .with_column(ExpectedColumn {
                name: "bool_col",
                expected_min: bool_array([Some(false)]),
                expected_max: bool_array([Some(true)]),
            })
            .with_column(ExpectedColumn {
                name: "tinyint_col",
                expected_min: i8_array([Some(0)]),
                expected_max: i8_array([Some(9)]),
            })
            .with_column(ExpectedColumn {
                name: "smallint_col",
                expected_min: i16_array([Some(0)]),
                expected_max: i16_array([Some(9)]),
            })
            .with_column(ExpectedColumn {
                name: "int_col",
                expected_min: i32_array([Some(0)]),
                expected_max: i32_array([Some(9)]),
            })
            .with_column(ExpectedColumn {
                name: "bigint_col",
                expected_min: i64_array([Some(0)]),
                expected_max: i64_array([Some(90)]),
            })
            .with_column(ExpectedColumn {
                name: "float_col",
                expected_min: f32_array([Some(0.0)]),
                expected_max: f32_array([Some(9.9)]),
            })
            .with_column(ExpectedColumn {
                name: "double_col",
                expected_min: f64_array([Some(0.0)]),
                expected_max: f64_array([Some(90.89999999999999)]),
            })
            .with_column(ExpectedColumn {
                name: "date_string_col",
                expected_min: utf8_array([Some("01/01/09")]),
                expected_max: utf8_array([Some("12/31/10")]),
            })
            .with_column(ExpectedColumn {
                name: "string_col",
                expected_min: utf8_array([Some("0")]),
                expected_max: utf8_array([Some("9")]),
            })
            // File has no min/max for timestamp_col
            .with_column(ExpectedColumn {
                name: "timestamp_col",
                expected_min: timestamp_nanoseconds_array([None], None),
                expected_max: timestamp_nanoseconds_array([None], None),
            })
            .with_column(ExpectedColumn {
                name: "year",
                expected_min: i32_array([Some(2009)]),
                expected_max: i32_array([Some(2010)]),
            })
            .with_column(ExpectedColumn {
                name: "month",
                expected_min: i32_array([Some(1)]),
                expected_max: i32_array([Some(12)]),
            })
            .run();
    }
  • Port fixed length decimal legacy:
    #[test]
    fn fixed_length_decimal_legacy() {
        // /parquet-testing/data/fixed_length_decimal_legacy.parquet
        // row_groups: 1
        // "value": FixedLenByteArray({min: Some(FixedLenByteArray(ByteArray { data: Some(ByteBufferPtr { data: b"\0\0\0\0\0\xc8" }) })), max: Some(FixedLenByteArray(ByteArray { data: "\0\0\0\0\t`" })), distinct_count: None, null_count: 0, min_max_deprecated: true, min_max_backwards_compatible: true})
        TestFile::new("fixed_length_decimal_legacy.parquet")
            .with_column(ExpectedColumn {
                name: "value",
                expected_min: Arc::new(
                    Decimal128Array::from(vec![Some(200)])
                        .with_precision_and_scale(13, 2)
                        .unwrap(),
                ),
                expected_max: Arc::new(
                    Decimal128Array::from(vec![Some(2400)])
                        .with_precision_and_scale(13, 2)
                        .unwrap(),
                ),
            })
            .run();
    }
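
For reference, the shared `Test` harness that the roundtrip tests above rely on looks roughly like the following. This is a minimal sketch: the struct fields and the 3-rows-per-row-group layout come straight from the tests quoted above, while the column name ("col") and the writer/reader plumbing are my assumptions, not the exact code in statistics.rs.

    use std::sync::Arc;

    use arrow_array::{ArrayRef, RecordBatch};
    use arrow_schema::{Field, Schema};
    use bytes::Bytes;
    use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
    use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
    use parquet::arrow::ArrowWriter;
    use parquet::file::properties::WriterProperties;

    /// Writes `input` to an in-memory parquet file with 3 rows per row group,
    /// then asserts that the extracted row-group statistics equal
    /// `expected_min` / `expected_max` (one entry per row group).
    struct Test {
        input: ArrayRef,
        expected_min: ArrayRef,
        expected_max: ArrayRef,
    }

    impl Test {
        fn run(self) {
            // Single-column schema; the actual harness may name the column differently
            let schema = Arc::new(Schema::new(vec![Field::new(
                "col",
                self.input.data_type().clone(),
                true,
            )]));
            let batch = RecordBatch::try_new(schema.clone(), vec![self.input.clone()]).unwrap();

            // 3 rows per row group, matching the "row group 1/2/3" comments above
            let props = WriterProperties::builder().set_max_row_group_size(3).build();
            let mut buf = Vec::new();
            let mut writer = ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap();
            writer.write(&batch).unwrap();
            writer.close().unwrap();

            // Read the metadata back and convert the row group statistics to arrays
            let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buf)).unwrap();
            let metadata = builder.metadata();
            let converter = StatisticsConverter::try_new(
                "col",
                &schema,
                metadata.file_metadata().schema_descr(),
            )
            .unwrap();
            let min = converter.row_group_mins(metadata.row_groups().iter()).unwrap();
            let max = converter.row_group_maxes(metadata.row_groups().iter()).unwrap();
            assert_eq!(&min, &self.expected_min, "unexpected mins");
            assert_eq!(&max, &self.expected_max, "unexpected maxes");
        }
    }

Because each test is a self-contained `Test { .. }.run()` call, moving a test between the two files is essentially a copy/paste, which is what makes this consolidation low-risk.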

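Similarly, the `TestFile` / `ExpectedColumn` harness used by the data-file tests opens a file from the parquet-testing submodule and checks each registered column. Again a sketch built from the usage shown above: `parquet_test_data()` from arrow's test utilities is my assumption for locating the data directory, and the read path is an approximation.

    use arrow::util::test_util::parquet_test_data;
    use arrow_array::ArrayRef;
    use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
    use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;

    /// Expected min/max statistics (one entry per row group) for one column
    struct ExpectedColumn {
        name: &'static str,
        expected_min: ArrayRef,
        expected_max: ArrayRef,
    }

    /// Opens a file from parquet-testing/data and validates the statistics
    /// of every column registered via `with_column`
    struct TestFile {
        file_name: &'static str,
        expected_columns: Vec<ExpectedColumn>,
    }

    impl TestFile {
        fn new(file_name: &'static str) -> Self {
            Self {
                file_name,
                expected_columns: vec![],
            }
        }

        fn with_column(mut self, column: ExpectedColumn) -> Self {
            self.expected_columns.push(column);
            self
        }

        fn run(self) {
            let path = std::path::PathBuf::from(parquet_test_data()).join(self.file_name);
            let file = std::fs::File::open(&path).unwrap();
            let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
            let schema = builder.schema().clone();
            let metadata = builder.metadata().clone();

            for expected in self.expected_columns {
                let converter = StatisticsConverter::try_new(
                    expected.name,
                    &schema,
                    metadata.file_metadata().schema_descr(),
                )
                .unwrap();
                let min = converter.row_group_mins(metadata.row_groups().iter()).unwrap();
                assert_eq!(&min, &expected.expected_min, "min for {}", expected.name);
                let max = converter.row_group_maxes(metadata.row_groups().iter()).unwrap();
                assert_eq!(&max, &expected.expected_max, "max for {}", expected.name);
            }
        }
    }
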
Additional context
This duplication is a historical artifact of how this code was developed in DataFusion; it was brought over when @efredine ported the work in #6046.

alamb added the parquet (Changes to the parquet crate) and enhancement (Any new improvement worthy of an entry in the changelog) labels on Aug 2, 2024
Kev1n8 (Contributor) commented Aug 3, 2024

take

This issue was closed.