From 4e647ad5a39ecf5341930e71503135aa29b2e98e Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Tue, 20 Jul 2021 00:12:47 +0800 Subject: [PATCH 1/7] impl sort fixed binary array Signed-off-by: Ruihang Xia --- arrow/src/compute/kernels/sort.rs | 106 +++++++++++++++++++++++++++++- arrow/src/compute/kernels/take.rs | 40 +++++++++++ arrow/src/compute/util.rs | 34 ++++++++++ 3 files changed, 179 insertions(+), 1 deletion(-) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 008661ec0104..b55f3c157633 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -383,6 +383,7 @@ pub fn sort_to_indices( } } } + DataType::FixedSizeBinary(_) => sort_binary(values, v, n, &options, limit), t => { return Err(ArrowError::ComputeError(format!( "Sort not supported for data type {:?}", @@ -764,6 +765,56 @@ where } } +fn sort_binary( + values: &ArrayRef, + value_indices: Vec, + mut null_indices: Vec, + options: &SortOptions, + limit: Option, +) -> UInt32Array { + let values = values + .as_any() + .downcast_ref::() + .unwrap(); + let mut valids: Vec<(u32, &[u8])> = value_indices + .iter() + .copied() + .map(|index| (index, values.value(index as usize))) + .collect(); + + let mut len = values.len(); + let descending = options.descending; + let nulls_len = null_indices.len(); + + if let Some(limit) = limit { + len = limit.min(len); + } + + if !descending { + sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| { + a.1.cmp(b.1) + }); + } else { + sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| { + a.1.cmp(b.1).reverse() + }); + null_indices.reverse(); + } + + println!("valids after sort: {:?}", valids); + + let mut valid_indices: Vec = valids.iter().map(|tuple| tuple.0).collect(); + if options.nulls_first { + null_indices.append(&mut valid_indices); + null_indices.truncate(len); + UInt32Array::from(null_indices) + } else { + valid_indices.append(&mut null_indices); + valid_indices.truncate(len); + UInt32Array::from(valid_indices) + } +} + /// Compare two `Array`s based on the ordering defined in [ord](crate::array::ord). fn cmp_array(a: &dyn Array, b: &dyn Array) -> Ordering { let cmp_op = build_compare(a, b).unwrap(); @@ -975,7 +1026,8 @@ impl LexicographicalComparator<'_> { mod tests { use super::*; use crate::compute::util::tests::{ - build_fixed_size_list_nullable, build_generic_list_nullable, + build_fixed_size_binary_nullable, build_fixed_size_list_nullable, + build_generic_list_nullable, }; use rand::rngs::StdRng; use rand::{Rng, RngCore, SeedableRng}; @@ -1183,6 +1235,32 @@ mod tests { } } + fn test_sort_binary_arrays( + data: Vec>>, + options: Option, + limit: Option, + expected_data: Vec>>, + fixed_length: Option, + ) { + if let Some(length) = fixed_length { + let input = Arc::new(build_fixed_size_binary_nullable(data.clone(), length)); + let sorted = match limit { + Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), + None => sort(&(input as ArrayRef), options).unwrap(), + }; + + let expected = Arc::new(build_fixed_size_binary_nullable( + expected_data.clone(), + length, + )) as ArrayRef; + + println!("array after sort: {:?}", sorted); + println!("expected array: {:?}", expected); + + assert_eq!(&sorted, &expected); + } + } + #[test] fn test_sort_to_indices_primitives() { test_sort_to_indices_primitive_arrays::( @@ -2382,6 +2460,32 @@ mod tests { ); } + #[test] + fn test_sort_binary() { + test_sort_binary_arrays( + vec![ + Some(vec![0, 0, 0]), + Some(vec![0, 0, 5]), + Some(vec![0, 0, 3]), + Some(vec![0, 0, 7]), + Some(vec![0, 0, 1]), + ], + Some(SortOptions { + descending: false, + nulls_first: false, + }), + None, + vec![ + Some(vec![0, 0, 0]), + Some(vec![0, 0, 1]), + Some(vec![0, 0, 3]), + Some(vec![0, 0, 5]), + Some(vec![0, 0, 7]), + ], + Some(3), + ); + } + #[test] fn test_lex_sort_single_column() { let input = vec![SortColumn { diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index f04208a689e6..4e72fe95af22 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -259,6 +259,16 @@ where DataType::UInt64 => downcast_dict_take!(UInt64Type, values, indices), t => unimplemented!("Take not supported for dictionary key type {:?}", t), }, + DataType::FixedSizeBinary(_) => { + let values = values + .as_any() + .downcast_ref::() + .unwrap(); + Ok(Arc::new(take_fixed_size_binary( + values, indices, + // *length as u32, + )?)) + } t => unimplemented!("Take not supported for data type {:?}", t), } } @@ -760,6 +770,36 @@ where Ok(FixedSizeListArray::from(list_data)) } +fn take_fixed_size_binary( + values: &FixedSizeBinaryArray, + indices: &PrimitiveArray, + // length: ::Native, +) -> Result +where + IndexType: ArrowNumericType, + IndexType::Native: ToPrimitive, +{ + println!("going to take array: {:?}", values); + println!("with indices: {:?}", indices); + + let data_ref = values.data_ref(); + let array_iter = indices + .values() + .iter() + .map(|idx| { + let idx = maybe_usize::(*idx)?; + if data_ref.is_valid(idx as usize) { + Ok(Some(values.value(idx as usize))) + } else { + Ok(None) + } + }) + .collect::>>()? + .into_iter(); + + FixedSizeBinaryArray::try_from_sparse_iter(array_iter) +} + /// `take` implementation for dictionary arrays /// /// applies `take` to the keys of the dictionary array and returns a new dictionary array diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs index 56de59483016..3cc7ee551fc7 100644 --- a/arrow/src/compute/util.rs +++ b/arrow/src/compute/util.rs @@ -402,6 +402,40 @@ pub(super) mod tests { FixedSizeListArray::from(list_data) } + pub(crate) fn build_fixed_size_binary_nullable( + binary_values: Vec>>, + length: ::Native, + ) -> FixedSizeBinaryArray { + let mut values = MutableBuffer::from_len_zeroed(0); + let mut binary_null_count = 0; + let array_len = binary_values.len(); + + let num_bytes = bit_util::ceil(array_len, 8); + let mut array_bitmap = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); + for (idx, element) in binary_values.into_iter().enumerate() { + if let Some(items) = element { + debug_assert_eq!(length as usize, items.len()); + + values.extend(items.into_iter()); + } else { + binary_null_count += 1; + bit_util::unset_bit(&mut array_bitmap.as_slice_mut(), idx); + values.extend(vec![0; length as usize].into_iter()); + } + } + + ArrayData::new( + DataType::FixedSizeBinary(length), + array_len, + Some(binary_null_count), + Some(array_bitmap.into()), + 0, + vec![values.into()], + vec![], + ) + .into() + } + #[test] fn test_take_value_index_from_list() { let list = build_generic_list::(vec![ From e009e30cc117a26e7eac719cf09d2801820cf8b0 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Tue, 20 Jul 2021 00:41:25 +0800 Subject: [PATCH 2/7] remove array builder util, add test cases Signed-off-by: Ruihang Xia --- arrow/src/compute/kernels/sort.rs | 101 +++++++++++++++++++++++++++--- arrow/src/compute/kernels/take.rs | 6 +- arrow/src/compute/util.rs | 34 ---------- 3 files changed, 93 insertions(+), 48 deletions(-) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index b55f3c157633..5e764bcd4c59 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -772,6 +772,9 @@ fn sort_binary( options: &SortOptions, limit: Option, ) -> UInt32Array { + println!("values: {:?}", values); + println!("indices: {:?}", value_indices); + let values = values .as_any() .downcast_ref::() @@ -782,6 +785,8 @@ fn sort_binary( .map(|index| (index, values.value(index as usize))) .collect(); + println!("valids: {:?}", valids); + let mut len = values.len(); let descending = options.descending; let nulls_len = null_indices.len(); @@ -1026,8 +1031,7 @@ impl LexicographicalComparator<'_> { mod tests { use super::*; use crate::compute::util::tests::{ - build_fixed_size_binary_nullable, build_fixed_size_list_nullable, - build_generic_list_nullable, + build_fixed_size_list_nullable, build_generic_list_nullable, }; use rand::rngs::StdRng; use rand::{Rng, RngCore, SeedableRng}; @@ -1242,17 +1246,18 @@ mod tests { expected_data: Vec>>, fixed_length: Option, ) { - if let Some(length) = fixed_length { - let input = Arc::new(build_fixed_size_binary_nullable(data.clone(), length)); + if let Some(_) = fixed_length { + let input = Arc::new( + FixedSizeBinaryArray::try_from_sparse_iter(data.into_iter()).unwrap(), + ); let sorted = match limit { Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), None => sort(&(input as ArrayRef), options).unwrap(), }; - - let expected = Arc::new(build_fixed_size_binary_nullable( - expected_data.clone(), - length, - )) as ArrayRef; + let expected = Arc::new( + FixedSizeBinaryArray::try_from_sparse_iter(expected_data.into_iter()) + .unwrap(), + ) as ArrayRef; println!("array after sort: {:?}", sorted); println!("expected array: {:?}", expected); @@ -2484,6 +2489,84 @@ mod tests { ], Some(3), ); + + // with nulls + test_sort_binary_arrays( + vec![ + Some(vec![0, 0, 0]), + None, + Some(vec![0, 0, 3]), + Some(vec![0, 0, 7]), + Some(vec![0, 0, 1]), + None, + ], + Some(SortOptions { + descending: false, + nulls_first: false, + }), + None, + vec![ + Some(vec![0, 0, 0]), + Some(vec![0, 0, 1]), + Some(vec![0, 0, 3]), + Some(vec![0, 0, 7]), + None, + None, + ], + Some(3), + ); + + // descending + test_sort_binary_arrays( + vec![ + Some(vec![0, 0, 0]), + None, + Some(vec![0, 0, 3]), + Some(vec![0, 0, 7]), + Some(vec![0, 0, 1]), + None, + ], + Some(SortOptions { + descending: true, + nulls_first: false, + }), + None, + vec![ + Some(vec![0, 0, 7]), + Some(vec![0, 0, 3]), + Some(vec![0, 0, 1]), + Some(vec![0, 0, 0]), + None, + None, + ], + Some(3), + ); + + // nulls first + test_sort_binary_arrays( + vec![ + Some(vec![0, 0, 0]), + None, + Some(vec![0, 0, 3]), + Some(vec![0, 0, 7]), + Some(vec![0, 0, 1]), + None, + ], + Some(SortOptions { + descending: false, + nulls_first: true, + }), + None, + vec![ + None, + None, + Some(vec![0, 0, 0]), + Some(vec![0, 0, 1]), + Some(vec![0, 0, 3]), + Some(vec![0, 0, 7]), + ], + Some(3), + ); } #[test] diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 4e72fe95af22..53759d358e22 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -264,10 +264,7 @@ where .as_any() .downcast_ref::() .unwrap(); - Ok(Arc::new(take_fixed_size_binary( - values, indices, - // *length as u32, - )?)) + Ok(Arc::new(take_fixed_size_binary(values, indices)?)) } t => unimplemented!("Take not supported for data type {:?}", t), } @@ -773,7 +770,6 @@ where fn take_fixed_size_binary( values: &FixedSizeBinaryArray, indices: &PrimitiveArray, - // length: ::Native, ) -> Result where IndexType: ArrowNumericType, diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs index 3cc7ee551fc7..56de59483016 100644 --- a/arrow/src/compute/util.rs +++ b/arrow/src/compute/util.rs @@ -402,40 +402,6 @@ pub(super) mod tests { FixedSizeListArray::from(list_data) } - pub(crate) fn build_fixed_size_binary_nullable( - binary_values: Vec>>, - length: ::Native, - ) -> FixedSizeBinaryArray { - let mut values = MutableBuffer::from_len_zeroed(0); - let mut binary_null_count = 0; - let array_len = binary_values.len(); - - let num_bytes = bit_util::ceil(array_len, 8); - let mut array_bitmap = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - for (idx, element) in binary_values.into_iter().enumerate() { - if let Some(items) = element { - debug_assert_eq!(length as usize, items.len()); - - values.extend(items.into_iter()); - } else { - binary_null_count += 1; - bit_util::unset_bit(&mut array_bitmap.as_slice_mut(), idx); - values.extend(vec![0; length as usize].into_iter()); - } - } - - ArrayData::new( - DataType::FixedSizeBinary(length), - array_len, - Some(binary_null_count), - Some(array_bitmap.into()), - 0, - vec![values.into()], - vec![], - ) - .into() - } - #[test] fn test_take_value_index_from_list() { let list = build_generic_list::(vec![ From 0ce4f890ef225f387a7e0950bcb8a57ce5cbc649 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Tue, 20 Jul 2021 13:57:14 +0800 Subject: [PATCH 3/7] impl sort for generic binary array Signed-off-by: Ruihang Xia --- arrow/src/array/cast.rs | 10 +++ arrow/src/array/mod.rs | 6 +- arrow/src/compute/kernels/sort.rs | 143 +++++++++++++++++++++++++++--- arrow/src/compute/kernels/take.rs | 46 +++++++++- 4 files changed, 186 insertions(+), 19 deletions(-) diff --git a/arrow/src/array/cast.rs b/arrow/src/array/cast.rs index 0477f2831f9a..dfc15608555f 100644 --- a/arrow/src/array/cast.rs +++ b/arrow/src/array/cast.rs @@ -59,6 +59,16 @@ pub fn as_large_list_array(arr: &ArrayRef) -> &LargeListArray { as_generic_list_array::(arr) } +#[doc = "Force downcast ArrayRef to GenericBinaryArray"] +#[inline] +pub fn as_generic_binary_array( + arr: &ArrayRef, +) -> &GenericBinaryArray { + arr.as_any() + .downcast_ref::>() + .expect("Unable to downcast to binary array") +} + macro_rules! array_downcast_fn { ($name: ident, $arrty: ty, $arrty_str:expr) => { #[doc = "Force downcast ArrayRef to "] diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 6a0b94a55515..69b65f41ad5a 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -273,9 +273,9 @@ pub use self::ord::{build_compare, DynComparator}; // --------------------- Array downcast helper functions --------------------- pub use self::cast::{ - as_boolean_array, as_dictionary_array, as_generic_list_array, as_large_list_array, - as_largestring_array, as_list_array, as_null_array, as_primitive_array, - as_string_array, as_struct_array, + as_boolean_array, as_dictionary_array, as_generic_binary_array, + as_generic_list_array, as_large_list_array, as_largestring_array, as_list_array, + as_null_array, as_primitive_array, as_string_array, as_struct_array, }; // ------------------------------ C Data Interface --------------------------- diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 5e764bcd4c59..3de16241e40b 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -383,7 +383,10 @@ pub fn sort_to_indices( } } } - DataType::FixedSizeBinary(_) => sort_binary(values, v, n, &options, limit), + DataType::Binary | DataType::FixedSizeBinary(_) => { + sort_binary::(values, v, n, &options, limit) + } + DataType::LargeBinary => sort_binary::(values, v, n, &options, limit), t => { return Err(ArrowError::ComputeError(format!( "Sort not supported for data type {:?}", @@ -765,25 +768,39 @@ where } } -fn sort_binary( +fn sort_binary( values: &ArrayRef, value_indices: Vec, mut null_indices: Vec, options: &SortOptions, limit: Option, -) -> UInt32Array { +) -> UInt32Array +where + S: BinaryOffsetSizeTrait, +{ println!("values: {:?}", values); println!("indices: {:?}", value_indices); - let values = values + let mut valids: Vec<(u32, &[u8])> = values .as_any() .downcast_ref::() - .unwrap(); - let mut valids: Vec<(u32, &[u8])> = value_indices - .iter() - .copied() - .map(|index| (index, values.value(index as usize))) - .collect(); + .map_or_else( + || { + let values = as_generic_binary_array::(values); + value_indices + .iter() + .copied() + .map(|index| (index, values.value(index as usize))) + .collect() + }, + |values| { + value_indices + .iter() + .copied() + .map(|index| (index, values.value(index as usize))) + .collect() + }, + ); println!("valids: {:?}", valids); @@ -794,7 +811,6 @@ fn sort_binary( if let Some(limit) = limit { len = limit.min(len); } - if !descending { sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| { a.1.cmp(b.1) @@ -1246,16 +1262,17 @@ mod tests { expected_data: Vec>>, fixed_length: Option, ) { - if let Some(_) = fixed_length { + // Fixed size binary array + if fixed_length.is_some() { let input = Arc::new( - FixedSizeBinaryArray::try_from_sparse_iter(data.into_iter()).unwrap(), + FixedSizeBinaryArray::try_from_sparse_iter(data.iter().cloned()).unwrap(), ); let sorted = match limit { Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), None => sort(&(input as ArrayRef), options).unwrap(), }; let expected = Arc::new( - FixedSizeBinaryArray::try_from_sparse_iter(expected_data.into_iter()) + FixedSizeBinaryArray::try_from_sparse_iter(expected_data.iter().cloned()) .unwrap(), ) as ArrayRef; @@ -1264,6 +1281,35 @@ mod tests { assert_eq!(&sorted, &expected); } + + // Generic size binary array + fn make_generic_binary_array( + data: &[Option>], + ) -> Arc> { + Arc::new(GenericBinaryArray::::from_opt_vec( + data.iter() + .map(|binary| binary.as_ref().map(Vec::as_slice)) + .collect(), + )) + } + + // BinaryArray + let input = make_generic_binary_array::(&data); + let sorted = match limit { + Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), + None => sort(&(input as ArrayRef), options).unwrap(), + }; + let expected = make_generic_binary_array::(&expected_data) as ArrayRef; + assert_eq!(&sorted, &expected); + + // LargeBinaryArray + let input = make_generic_binary_array::(&data); + let sorted = match limit { + Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), + None => sort(&(input as ArrayRef), options).unwrap(), + }; + let expected = make_generic_binary_array::(&expected_data) as ArrayRef; + assert_eq!(&sorted, &expected); } #[test] @@ -2567,6 +2613,75 @@ mod tests { ], Some(3), ); + + // limit + test_sort_binary_arrays( + vec![ + Some(vec![0, 0, 0]), + None, + Some(vec![0, 0, 3]), + Some(vec![0, 0, 7]), + Some(vec![0, 0, 1]), + None, + ], + Some(SortOptions { + descending: false, + nulls_first: true, + }), + Some(4), + vec![None, None, Some(vec![0, 0, 0]), Some(vec![0, 0, 1])], + Some(3), + ); + + // var length + test_sort_binary_arrays( + vec![ + Some(b"Hello".to_vec()), + None, + Some(b"from".to_vec()), + Some(b"Apache".to_vec()), + Some(b"Arrow-rs".to_vec()), + None, + ], + Some(SortOptions { + descending: false, + nulls_first: false, + }), + None, + vec![ + Some(b"Apache".to_vec()), + Some(b"Arrow-rs".to_vec()), + Some(b"Hello".to_vec()), + Some(b"from".to_vec()), + None, + None, + ], + None, + ); + + // limit + test_sort_binary_arrays( + vec![ + Some(b"Hello".to_vec()), + None, + Some(b"from".to_vec()), + Some(b"Apache".to_vec()), + Some(b"Arrow-rs".to_vec()), + None, + ], + Some(SortOptions { + descending: false, + nulls_first: true, + }), + Some(4), + vec![ + None, + None, + Some(b"Apache".to_vec()), + Some(b"Arrow-rs".to_vec()), + ], + None, + ); } #[test] diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 53759d358e22..6a15883957e1 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -17,6 +17,7 @@ //! Defines take kernel for [Array] +use std::iter::FromIterator; use std::{ops::AddAssign, sync::Arc}; use crate::buffer::{Buffer, MutableBuffer}; @@ -259,6 +260,20 @@ where DataType::UInt64 => downcast_dict_take!(UInt64Type, values, indices), t => unimplemented!("Take not supported for dictionary key type {:?}", t), }, + DataType::Binary => { + let values = values + .as_any() + .downcast_ref::>() + .unwrap(); + Ok(Arc::new(take_binary(values, indices)?)) + } + DataType::LargeBinary => { + let values = values + .as_any() + .downcast_ref::>() + .unwrap(); + Ok(Arc::new(take_binary(values, indices)?)) + } DataType::FixedSizeBinary(_) => { let values = values .as_any() @@ -767,6 +782,33 @@ where Ok(FixedSizeListArray::from(list_data)) } +fn take_binary( + values: &GenericBinaryArray, + indices: &PrimitiveArray, +) -> Result> +where + OffsetType: BinaryOffsetSizeTrait, + IndexType: ArrowNumericType, + IndexType::Native: ToPrimitive, +{ + let data_ref = values.data_ref(); + let array_iter = indices + .values() + .iter() + .map(|idx| { + let idx = maybe_usize::(*idx)?; + if data_ref.is_valid(idx) { + Ok(Some(values.value(idx))) + } else { + Ok(None) + } + }) + .collect::>>()? + .into_iter(); + + Ok(GenericBinaryArray::::from_iter(array_iter)) +} + fn take_fixed_size_binary( values: &FixedSizeBinaryArray, indices: &PrimitiveArray, @@ -784,8 +826,8 @@ where .iter() .map(|idx| { let idx = maybe_usize::(*idx)?; - if data_ref.is_valid(idx as usize) { - Ok(Some(values.value(idx as usize))) + if data_ref.is_valid(idx) { + Ok(Some(values.value(idx))) } else { Ok(None) } From 64b66a2cb196dc3990c89b9c3d80bc1804df1584 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Tue, 20 Jul 2021 14:09:04 +0800 Subject: [PATCH 4/7] tidy Signed-off-by: Ruihang Xia --- arrow/src/compute/kernels/sort.rs | 10 ---------- arrow/src/compute/kernels/take.rs | 3 --- 2 files changed, 13 deletions(-) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 3de16241e40b..1a105e73fe2c 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -778,9 +778,6 @@ fn sort_binary( where S: BinaryOffsetSizeTrait, { - println!("values: {:?}", values); - println!("indices: {:?}", value_indices); - let mut valids: Vec<(u32, &[u8])> = values .as_any() .downcast_ref::() @@ -802,8 +799,6 @@ where }, ); - println!("valids: {:?}", valids); - let mut len = values.len(); let descending = options.descending; let nulls_len = null_indices.len(); @@ -822,8 +817,6 @@ where null_indices.reverse(); } - println!("valids after sort: {:?}", valids); - let mut valid_indices: Vec = valids.iter().map(|tuple| tuple.0).collect(); if options.nulls_first { null_indices.append(&mut valid_indices); @@ -1276,9 +1269,6 @@ mod tests { .unwrap(), ) as ArrayRef; - println!("array after sort: {:?}", sorted); - println!("expected array: {:?}", expected); - assert_eq!(&sorted, &expected); } diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 6a15883957e1..c85a23d06e0c 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -817,9 +817,6 @@ where IndexType: ArrowNumericType, IndexType::Native: ToPrimitive, { - println!("going to take array: {:?}", values); - println!("with indices: {:?}", indices); - let data_ref = values.data_ref(); let array_iter = indices .values() From aabfec84d3834a7826698db5c2511abd6534599a Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Tue, 20 Jul 2021 14:30:09 +0800 Subject: [PATCH 5/7] run clippy Signed-off-by: Ruihang Xia --- arrow/src/compute/kernels/take.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index c85a23d06e0c..75c8f766888f 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -17,7 +17,6 @@ //! Defines take kernel for [Array] -use std::iter::FromIterator; use std::{ops::AddAssign, sync::Arc}; use crate::buffer::{Buffer, MutableBuffer}; @@ -806,7 +805,7 @@ where .collect::>>()? .into_iter(); - Ok(GenericBinaryArray::::from_iter(array_iter)) + Ok(array_iter.collect::>()) } fn take_fixed_size_binary( From f35532d5b0bbe025d7d3817ad10bda985b559301 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Sun, 25 Jul 2021 22:09:27 +0800 Subject: [PATCH 6/7] test: fixed binary with different prefix Signed-off-by: Ruihang Xia --- arrow/src/compute/kernels/sort.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 1a105e73fe2c..3617f9c10a5a 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -2552,6 +2552,31 @@ mod tests { Some(3), ); + test_sort_binary_arrays( + vec![ + Some(vec![3, 5, 7]), + None, + Some(vec![1, 7, 1]), + Some(vec![2, 7, 3]), + None, + Some(vec![1, 4, 3]), + ], + Some(SortOptions { + descending: false, + nulls_first: false, + }), + None, + vec![ + Some(vec![1, 4, 3]), + Some(vec![1, 7, 1]), + Some(vec![2, 7, 3]), + Some(vec![3, 5, 7]), + None, + None, + ], + Some(3), + ); + // descending test_sort_binary_arrays( vec![ From 5877badd0b60ab2d59fb599323bfcc6adf141c61 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Sun, 25 Jul 2021 22:25:09 +0800 Subject: [PATCH 7/7] rebase master Signed-off-by: Ruihang Xia --- arrow/src/compute/kernels/sort.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 3617f9c10a5a..95afc14549e9 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -807,11 +807,11 @@ where len = limit.min(len); } if !descending { - sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| { + sort_unstable_by(&mut valids, len.saturating_sub(nulls_len), |a, b| { a.1.cmp(b.1) }); } else { - sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| { + sort_unstable_by(&mut valids, len.saturating_sub(nulls_len), |a, b| { a.1.cmp(b.1).reverse() }); null_indices.reverse();