From 3274461b43f4570d3b122a88e977c2488560815b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 8 Aug 2021 03:46:14 -0400 Subject: [PATCH] Fix parquet string statistics generation (#643) * Fix string statistics generation, add tests * fix Int96 stats test * Add notes for additional tickets --- parquet/src/column/writer.rs | 122 +++++++++++++++++++++++++++++++++++ parquet/src/data_type.rs | 29 ++++----- 2 files changed, 134 insertions(+), 17 deletions(-) diff --git a/parquet/src/column/writer.rs b/parquet/src/column/writer.rs index d5b845756da2..3cb17e17f7f6 100644 --- a/parquet/src/column/writer.rs +++ b/parquet/src/column/writer.rs @@ -1687,6 +1687,128 @@ mod tests { ); } + #[test] + fn test_bool_statistics() { + let stats = statistics_roundtrip::(&[true, false, false, true]); + assert!(stats.has_min_max_set()); + // should it be BooleanStatistics?? + // https://github.com/apache/arrow-rs/issues/659 + if let Statistics::Int32(stats) = stats { + assert_eq!(stats.min(), &0); + assert_eq!(stats.max(), &1); + } else { + panic!("expecting Statistics::Int32, got {:?}", stats); + } + } + + #[test] + fn test_int32_statistics() { + let stats = statistics_roundtrip::(&[-1, 3, -2, 2]); + assert!(stats.has_min_max_set()); + if let Statistics::Int32(stats) = stats { + assert_eq!(stats.min(), &-2); + assert_eq!(stats.max(), &3); + } else { + panic!("expecting Statistics::Int32, got {:?}", stats); + } + } + + #[test] + fn test_int64_statistics() { + let stats = statistics_roundtrip::(&[-1, 3, -2, 2]); + assert!(stats.has_min_max_set()); + if let Statistics::Int64(stats) = stats { + assert_eq!(stats.min(), &-2); + assert_eq!(stats.max(), &3); + } else { + panic!("expecting Statistics::Int64, got {:?}", stats); + } + } + + #[test] + fn test_int96_statistics() { + let input = vec![ + Int96::from(vec![1, 20, 30]), + Int96::from(vec![3, 20, 10]), + Int96::from(vec![0, 20, 30]), + Int96::from(vec![2, 20, 30]), + ] + .into_iter() + .collect::>(); + + let stats = statistics_roundtrip::(&input); + assert!(stats.has_min_max_set()); + if let Statistics::Int96(stats) = stats { + assert_eq!(stats.min(), &Int96::from(vec![0, 20, 30])); + assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10])); + } else { + panic!("expecting Statistics::Int96, got {:?}", stats); + } + } + + #[test] + fn test_float_statistics() { + let stats = statistics_roundtrip::(&[-1.0, 3.0, -2.0, 2.0]); + assert!(stats.has_min_max_set()); + if let Statistics::Float(stats) = stats { + assert_eq!(stats.min(), &-2.0); + assert_eq!(stats.max(), &3.0); + } else { + panic!("expecting Statistics::Float, got {:?}", stats); + } + } + + #[test] + fn test_double_statistics() { + let stats = statistics_roundtrip::(&[-1.0, 3.0, -2.0, 2.0]); + assert!(stats.has_min_max_set()); + if let Statistics::Double(stats) = stats { + assert_eq!(stats.min(), &-2.0); + assert_eq!(stats.max(), &3.0); + } else { + panic!("expecting Statistics::Double, got {:?}", stats); + } + } + + #[test] + fn test_byte_array_statistics() { + let input = vec!["aawaa", "zz", "aaw", "m", "qrs"] + .iter() + .map(|&s| s.into()) + .collect::>(); + + let stats = statistics_roundtrip::(&input); + assert!(stats.has_min_max_set()); + if let Statistics::ByteArray(stats) = stats { + assert_eq!(stats.min(), &ByteArray::from("aaw")); + assert_eq!(stats.max(), &ByteArray::from("zz")); + } else { + panic!("expecting Statistics::ByteArray, got {:?}", stats); + } + } + + #[test] + fn test_fixed_len_byte_array_statistics() { + let input = vec!["aawaa", "zz ", "aaw ", "m ", "qrs "] + .iter() + .map(|&s| { + let b: ByteArray = s.into(); + b.into() + }) + .collect::>(); + + let stats = statistics_roundtrip::(&input); + assert!(stats.has_min_max_set()); + // should it be FixedLenByteArray? + // https://github.com/apache/arrow-rs/issues/660 + if let Statistics::ByteArray(stats) = stats { + assert_eq!(stats.min(), &ByteArray::from("aaw ")); + assert_eq!(stats.max(), &ByteArray::from("zz ")); + } else { + panic!("expecting Statistics::ByteArray, got {:?}", stats); + } + } + #[test] fn test_float_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f32::NAN, 2.0]); diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 797324e421e1..127ba95387e3 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -128,23 +128,18 @@ impl std::fmt::Debug for ByteArray { impl PartialOrd for ByteArray { fn partial_cmp(&self, other: &ByteArray) -> Option { - if self.data.is_some() && other.data.is_some() { - match self.len().cmp(&other.len()) { - Ordering::Greater => Some(Ordering::Greater), - Ordering::Less => Some(Ordering::Less), - Ordering::Equal => { - for (v1, v2) in self.data().iter().zip(other.data().iter()) { - match v1.cmp(v2) { - Ordering::Greater => return Some(Ordering::Greater), - Ordering::Less => return Some(Ordering::Less), - _ => {} - } - } - Some(Ordering::Equal) - } + // sort nulls first (consistent with PartialCmp on Option) + // + // Since ByteBuffer doesn't implement PartialOrd, so can't + // derive an implementation + match (&self.data, &other.data) { + (None, None) => Some(Ordering::Equal), + (None, Some(_)) => Some(Ordering::Less), + (Some(_), None) => Some(Ordering::Greater), + (Some(self_data), Some(other_data)) => { + // compare slices directly + self_data.data().partial_cmp(other_data.data()) } - } else { - None } } } @@ -1368,7 +1363,7 @@ mod tests { let ba4 = ByteArray::from(vec![]); let ba5 = ByteArray::from(vec![2, 2, 3]); - assert!(ba1 > ba2); + assert!(ba1 < ba2); assert!(ba3 > ba1); assert!(ba1 > ba4); assert_eq!(ba1, ba11);