-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Fix parquet string statistics generation #643
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1687,6 +1687,128 @@ mod tests { | |
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_bool_statistics() { | ||
| let stats = statistics_roundtrip::<BoolType>(&[true, false, false, true]); | ||
| assert!(stats.has_min_max_set()); | ||
| // should it be BooleanStatistics?? | ||
| // https://github.com/apache/arrow-rs/issues/659 | ||
| if let Statistics::Int32(stats) = stats { | ||
| assert_eq!(stats.min(), &0); | ||
| assert_eq!(stats.max(), &1); | ||
| } else { | ||
| panic!("expecting Statistics::Int32, got {:?}", stats); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_int32_statistics() { | ||
| let stats = statistics_roundtrip::<Int32Type>(&[-1, 3, -2, 2]); | ||
| assert!(stats.has_min_max_set()); | ||
| if let Statistics::Int32(stats) = stats { | ||
| assert_eq!(stats.min(), &-2); | ||
| assert_eq!(stats.max(), &3); | ||
| } else { | ||
| panic!("expecting Statistics::Int32, got {:?}", stats); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_int64_statistics() { | ||
| let stats = statistics_roundtrip::<Int64Type>(&[-1, 3, -2, 2]); | ||
| assert!(stats.has_min_max_set()); | ||
| if let Statistics::Int64(stats) = stats { | ||
| assert_eq!(stats.min(), &-2); | ||
| assert_eq!(stats.max(), &3); | ||
| } else { | ||
| panic!("expecting Statistics::Int64, got {:?}", stats); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_int96_statistics() { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed Int96 stats test |
||
| let input = vec![ | ||
| Int96::from(vec![1, 20, 30]), | ||
| Int96::from(vec![3, 20, 10]), | ||
| Int96::from(vec![0, 20, 30]), | ||
| Int96::from(vec![2, 20, 30]), | ||
| ] | ||
| .into_iter() | ||
| .collect::<Vec<Int96>>(); | ||
|
|
||
| let stats = statistics_roundtrip::<Int96Type>(&input); | ||
| assert!(stats.has_min_max_set()); | ||
| if let Statistics::Int96(stats) = stats { | ||
| assert_eq!(stats.min(), &Int96::from(vec![0, 20, 30])); | ||
| assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10])); | ||
| } else { | ||
| panic!("expecting Statistics::Int96, got {:?}", stats); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_float_statistics() { | ||
| let stats = statistics_roundtrip::<FloatType>(&[-1.0, 3.0, -2.0, 2.0]); | ||
| assert!(stats.has_min_max_set()); | ||
| if let Statistics::Float(stats) = stats { | ||
| assert_eq!(stats.min(), &-2.0); | ||
| assert_eq!(stats.max(), &3.0); | ||
| } else { | ||
| panic!("expecting Statistics::Float, got {:?}", stats); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_double_statistics() { | ||
| let stats = statistics_roundtrip::<DoubleType>(&[-1.0, 3.0, -2.0, 2.0]); | ||
| assert!(stats.has_min_max_set()); | ||
| if let Statistics::Double(stats) = stats { | ||
| assert_eq!(stats.min(), &-2.0); | ||
| assert_eq!(stats.max(), &3.0); | ||
| } else { | ||
| panic!("expecting Statistics::Double, got {:?}", stats); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_byte_array_statistics() { | ||
| let input = vec!["aawaa", "zz", "aaw", "m", "qrs"] | ||
| .iter() | ||
| .map(|&s| s.into()) | ||
| .collect::<Vec<ByteArray>>(); | ||
|
|
||
| let stats = statistics_roundtrip::<ByteArrayType>(&input); | ||
| assert!(stats.has_min_max_set()); | ||
| if let Statistics::ByteArray(stats) = stats { | ||
| assert_eq!(stats.min(), &ByteArray::from("aaw")); | ||
| assert_eq!(stats.max(), &ByteArray::from("zz")); | ||
| } else { | ||
| panic!("expecting Statistics::ByteArray, got {:?}", stats); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_fixed_len_byte_array_statistics() { | ||
| let input = vec!["aawaa", "zz ", "aaw ", "m ", "qrs "] | ||
| .iter() | ||
| .map(|&s| { | ||
| let b: ByteArray = s.into(); | ||
| b.into() | ||
| }) | ||
| .collect::<Vec<FixedLenByteArray>>(); | ||
|
|
||
| let stats = statistics_roundtrip::<FixedLenByteArrayType>(&input); | ||
| assert!(stats.has_min_max_set()); | ||
| // should it be FixedLenByteArray? | ||
|
||
| // https://github.com/apache/arrow-rs/issues/660 | ||
| if let Statistics::ByteArray(stats) = stats { | ||
| assert_eq!(stats.min(), &ByteArray::from("aaw ")); | ||
| assert_eq!(stats.max(), &ByteArray::from("zz ")); | ||
| } else { | ||
| panic!("expecting Statistics::ByteArray, got {:?}", stats); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_float_statistics_nan_middle() { | ||
| let stats = statistics_roundtrip::<FloatType>(&[1.0, f32::NAN, 2.0]); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -128,23 +128,18 @@ impl std::fmt::Debug for ByteArray { | |
|
|
||
| impl PartialOrd for ByteArray { | ||
| fn partial_cmp(&self, other: &ByteArray) -> Option<Ordering> { | ||
| if self.data.is_some() && other.data.is_some() { | ||
| match self.len().cmp(&other.len()) { | ||
| Ordering::Greater => Some(Ordering::Greater), | ||
| Ordering::Less => Some(Ordering::Less), | ||
| Ordering::Equal => { | ||
| for (v1, v2) in self.data().iter().zip(other.data().iter()) { | ||
| match v1.cmp(v2) { | ||
| Ordering::Greater => return Some(Ordering::Greater), | ||
| Ordering::Less => return Some(Ordering::Less), | ||
| _ => {} | ||
| } | ||
| } | ||
| Some(Ordering::Equal) | ||
| } | ||
| // sort nulls first (consistent with PartialCmp on Option) | ||
|
||
| // | ||
| // Since ByteBuffer doesn't implement PartialOrd, so can't | ||
| // derive an implementation | ||
| match (&self.data, &other.data) { | ||
| (None, None) => Some(Ordering::Equal), | ||
| (None, Some(_)) => Some(Ordering::Less), | ||
| (Some(_), None) => Some(Ordering::Greater), | ||
| (Some(self_data), Some(other_data)) => { | ||
| // compare slices directly | ||
| self_data.data().partial_cmp(other_data.data()) | ||
| } | ||
| } else { | ||
| None | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -1368,7 +1363,7 @@ mod tests { | |
| let ba4 = ByteArray::from(vec![]); | ||
| let ba5 = ByteArray::from(vec![2, 2, 3]); | ||
|
|
||
| assert!(ba1 > ba2); | ||
| assert!(ba1 < ba2); | ||
|
||
| assert!(ba3 > ba1); | ||
| assert!(ba1 > ba4); | ||
| assert_eq!(ba1, ba11); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
note 1: it is strange that a
Booleancolumn producesInt32Stats(and notBooleanStats)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Filed #659 to track