Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 122 additions & 0 deletions parquet/src/column/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1687,6 +1687,128 @@ mod tests {
);
}

#[test]
fn test_bool_statistics() {
let stats = statistics_roundtrip::<BoolType>(&[true, false, false, true]);
assert!(stats.has_min_max_set());
// should it be BooleanStatistics??
// https://github.com/apache/arrow-rs/issues/659
if let Statistics::Int32(stats) = stats {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note 1: it is strange that a Boolean column produces Int32Stats (and not BooleanStats)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filed #659 to track

assert_eq!(stats.min(), &0);
assert_eq!(stats.max(), &1);
} else {
panic!("expecting Statistics::Int32, got {:?}", stats);
}
}

#[test]
fn test_int32_statistics() {
let stats = statistics_roundtrip::<Int32Type>(&[-1, 3, -2, 2]);
assert!(stats.has_min_max_set());
if let Statistics::Int32(stats) = stats {
assert_eq!(stats.min(), &-2);
assert_eq!(stats.max(), &3);
} else {
panic!("expecting Statistics::Int32, got {:?}", stats);
}
}

#[test]
fn test_int64_statistics() {
let stats = statistics_roundtrip::<Int64Type>(&[-1, 3, -2, 2]);
assert!(stats.has_min_max_set());
if let Statistics::Int64(stats) = stats {
assert_eq!(stats.min(), &-2);
assert_eq!(stats.max(), &3);
} else {
panic!("expecting Statistics::Int64, got {:?}", stats);
}
}

#[test]
fn test_int96_statistics() {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed Int96 stats test

let input = vec![
Int96::from(vec![1, 20, 30]),
Int96::from(vec![3, 20, 10]),
Int96::from(vec![0, 20, 30]),
Int96::from(vec![2, 20, 30]),
]
.into_iter()
.collect::<Vec<Int96>>();

let stats = statistics_roundtrip::<Int96Type>(&input);
assert!(stats.has_min_max_set());
if let Statistics::Int96(stats) = stats {
assert_eq!(stats.min(), &Int96::from(vec![0, 20, 30]));
assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10]));
} else {
panic!("expecting Statistics::Int96, got {:?}", stats);
}
}

#[test]
fn test_float_statistics() {
let stats = statistics_roundtrip::<FloatType>(&[-1.0, 3.0, -2.0, 2.0]);
assert!(stats.has_min_max_set());
if let Statistics::Float(stats) = stats {
assert_eq!(stats.min(), &-2.0);
assert_eq!(stats.max(), &3.0);
} else {
panic!("expecting Statistics::Float, got {:?}", stats);
}
}

#[test]
fn test_double_statistics() {
let stats = statistics_roundtrip::<DoubleType>(&[-1.0, 3.0, -2.0, 2.0]);
assert!(stats.has_min_max_set());
if let Statistics::Double(stats) = stats {
assert_eq!(stats.min(), &-2.0);
assert_eq!(stats.max(), &3.0);
} else {
panic!("expecting Statistics::Double, got {:?}", stats);
}
}

#[test]
fn test_byte_array_statistics() {
let input = vec!["aawaa", "zz", "aaw", "m", "qrs"]
.iter()
.map(|&s| s.into())
.collect::<Vec<ByteArray>>();

let stats = statistics_roundtrip::<ByteArrayType>(&input);
assert!(stats.has_min_max_set());
if let Statistics::ByteArray(stats) = stats {
assert_eq!(stats.min(), &ByteArray::from("aaw"));
assert_eq!(stats.max(), &ByteArray::from("zz"));
} else {
panic!("expecting Statistics::ByteArray, got {:?}", stats);
}
}

#[test]
fn test_fixed_len_byte_array_statistics() {
let input = vec!["aawaa", "zz ", "aaw ", "m ", "qrs "]
.iter()
.map(|&s| {
let b: ByteArray = s.into();
b.into()
})
.collect::<Vec<FixedLenByteArray>>();

let stats = statistics_roundtrip::<FixedLenByteArrayType>(&input);
assert!(stats.has_min_max_set());
// should it be FixedLenByteArray?
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note 2: it is strange that a FixedLenByteArray column produces ByteArrayStats (and not FixedLenByteArrayStats)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filed #660

// https://github.com/apache/arrow-rs/issues/660
if let Statistics::ByteArray(stats) = stats {
assert_eq!(stats.min(), &ByteArray::from("aaw "));
assert_eq!(stats.max(), &ByteArray::from("zz "));
} else {
panic!("expecting Statistics::ByteArray, got {:?}", stats);
}
}

#[test]
fn test_float_statistics_nan_middle() {
let stats = statistics_roundtrip::<FloatType>(&[1.0, f32::NAN, 2.0]);
Expand Down
29 changes: 12 additions & 17 deletions parquet/src/data_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,23 +128,18 @@ impl std::fmt::Debug for ByteArray {

impl PartialOrd for ByteArray {
fn partial_cmp(&self, other: &ByteArray) -> Option<Ordering> {
if self.data.is_some() && other.data.is_some() {
match self.len().cmp(&other.len()) {
Ordering::Greater => Some(Ordering::Greater),
Ordering::Less => Some(Ordering::Less),
Ordering::Equal => {
for (v1, v2) in self.data().iter().zip(other.data().iter()) {
match v1.cmp(v2) {
Ordering::Greater => return Some(Ordering::Greater),
Ordering::Less => return Some(Ordering::Less),
_ => {}
}
}
Some(Ordering::Equal)
}
// sort nulls first (consistent with PartialCmp on Option)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The previous implementation of PartialOrd seems to have been introduced in 48c3771 in apache/arrow#7622 by

It would be great if @zeevm or @sunchao had any additional context or comment to share.

Copy link
Member

@sunchao sunchao Aug 1, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! the original PR is apache/arrow#7586 but yea we missed this in the review and I think the existing logic is incorrect.

//
// Since ByteBuffer doesn't implement PartialOrd, so can't
// derive an implementation
match (&self.data, &other.data) {
(None, None) => Some(Ordering::Equal),
(None, Some(_)) => Some(Ordering::Less),
(Some(_), None) => Some(Ordering::Greater),
(Some(self_data), Some(other_data)) => {
// compare slices directly
self_data.data().partial_cmp(other_data.data())
}
} else {
None
}
}
}
Expand Down Expand Up @@ -1368,7 +1363,7 @@ mod tests {
let ba4 = ByteArray::from(vec![]);
let ba5 = ByteArray::from(vec![2, 2, 3]);

assert!(ba1 > ba2);
assert!(ba1 < ba2);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

assert!(ba3 > ba1);
assert!(ba1 > ba4);
assert_eq!(ba1, ba11);
Expand Down