diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs
index 2f0e9d4f112..e844853eae8 100644
--- a/rust/arrow/src/array/array.rs
+++ b/rust/arrow/src/array/array.rs
@@ -2115,28 +2115,32 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer, usize)> for StructArray {
 /// Example **with nullable** data:
 ///
 /// ```
-/// use arrow::array::DictionaryArray;
+/// use arrow::array::{DictionaryArray, Int8Array};
 /// use arrow::datatypes::Int8Type;
 /// let test = vec!["a", "a", "b", "c"];
 /// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect();
-/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0), Some(0), None, Some(1)]);
+/// assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)]));
 /// ```
 ///
 /// Example **without nullable** data:
 ///
 /// ```
-/// use arrow::array::DictionaryArray;
+/// use arrow::array::{DictionaryArray, Int8Array};
 /// use arrow::datatypes::Int8Type;
 /// let test = vec!["a", "a", "b", "c"];
 /// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
-/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0), Some(0), Some(1), Some(2)]);
+/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
 /// ```
 pub struct DictionaryArray<K: ArrowPrimitiveType> {
-    /// Array of keys, stored as a PrimitiveArray.
+    /// Data of this dictionary. Note that this is _not_ compatible with the C Data interface,
+    /// as, in the current implementation, `values` below are the first child of this struct.
    data: ArrayDataRef,
 
-    /// Pointer to the key values.
-    raw_values: RawPtrBox<K::Native>,
+    /// The keys of this dictionary. These are constructed from the buffer and null bitmap
+    /// of `data`.
+    /// Also, note that these do not correspond to the true values of this array. Rather, they map
+    /// to the real values.
+    keys: PrimitiveArray<K>,
 
     /// Array of dictionary values (can by any DataType).
     values: ArrayRef,
@@ -2145,112 +2149,10 @@ pub struct DictionaryArray<K: ArrowPrimitiveType> {
     is_ordered: bool,
 }
 
-#[derive(Debug)]
-enum Draining {
-    Ready,
-    Iterating,
-    Finished,
-}
-
-#[derive(Debug)]
-pub struct NullableIter<'a, T> {
-    data: &'a ArrayDataRef, // TODO: Use a pointer to the null bitmap.
-    ptr: *const T,
-    i: usize,
-    len: usize,
-    draining: Draining,
-}
-
-impl<'a, T> std::iter::Iterator for NullableIter<'a, T>
-where
-    T: Clone,
-{
-    type Item = Option<T>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        let i = self.i;
-        if i >= self.len {
-            None
-        } else if self.data.is_null(i) {
-            self.i += 1;
-            Some(None)
-        } else {
-            self.i += 1;
-            unsafe { Some(Some((&*self.ptr.add(i)).clone())) }
-        }
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        (self.len, Some(self.len))
-    }
-
-    fn nth(&mut self, n: usize) -> Option<Self::Item> {
-        let i = self.i;
-        if i + n >= self.len {
-            self.i = self.len;
-            None
-        } else if self.data.is_null(i + n) {
-            self.i += n + 1;
-            Some(None)
-        } else {
-            self.i += n + 1;
-            unsafe { Some(Some((&*self.ptr.add(i + n)).clone())) }
-        }
-    }
-}
-
-impl<'a, T> std::iter::DoubleEndedIterator for NullableIter<'a, T>
-where
-    T: Clone,
-{
-    fn next_back(&mut self) -> Option<Self::Item> {
-        match self.draining {
-            Draining::Ready => {
-                self.draining = Draining::Iterating;
-                self.i = self.len - 1;
-                self.next_back()
-            }
-            Draining::Iterating => {
-                let i = self.i;
-                if i >= self.len {
-                    None
-                } else if self.data.is_null(i) {
-                    self.i = self.i.checked_sub(1).unwrap_or_else(|| {
-                        self.draining = Draining::Finished;
-                        0_usize
-                    });
-                    Some(None)
-                } else {
-                    match i.checked_sub(1) {
-                        Some(idx) => {
-                            self.i = idx;
-                            unsafe { Some(Some((&*self.ptr.add(i)).clone())) }
-                        }
-                        _ => {
-                            self.draining = Draining::Finished;
-                            unsafe { Some(Some((&*self.ptr).clone())) }
-                        }
-                    }
-                }
-            }
-            Draining::Finished => {
-                self.draining = Draining::Ready;
-                None
-            }
-        }
-    }
-}
-
 impl<'a, K: ArrowPrimitiveType> DictionaryArray<K> {
     /// Return an iterator to the keys of this dictionary.
-    pub fn keys(&self) -> NullableIter<'_, K::Native> {
-        NullableIter::<'_, K::Native> {
-            data: &self.data,
-            ptr: unsafe { self.raw_values.get().add(self.data.offset()) },
-            i: 0,
-            len: self.data.len(),
-            draining: Draining::Ready,
-        }
+    pub fn keys(&self) -> &PrimitiveArray<K> {
+        &self.keys
     }
 
     /// Returns an array view of the keys of this dictionary
@@ -2291,12 +2193,12 @@ impl<'a, K: ArrowPrimitiveType> DictionaryArray<K> {
 
     /// The length of the dictionary is the length of the keys array.
     pub fn len(&self) -> usize {
-        self.data.len()
+        self.keys.len()
     }
 
     /// Whether this dictionary is empty
     pub fn is_empty(&self) -> bool {
-        self.data.is_empty()
+        self.keys.is_empty()
     }
 
     // Currently exists for compatibility purposes with Arrow IPC.
@@ -2319,13 +2221,24 @@ impl<T: ArrowPrimitiveType> From<ArrayDataRef> for DictionaryArray<T> {
             "DictionaryArray should contain a single child array (values)."
         );
 
-        let raw_values = data.buffers()[0].raw_data();
-        let dtype: &DataType = data.data_type();
-        let values = make_array(data.child_data()[0].clone());
-        if let DataType::Dictionary(_, _) = dtype {
+        if let DataType::Dictionary(key_data_type, _) = data.data_type() {
+            if key_data_type.as_ref() != &T::DATA_TYPE {
+                panic!("DictionaryArray's data type must match.")
+            };
+            // create a zero-copy of the keys' data
+            let keys = PrimitiveArray::<T>::from(Arc::new(ArrayData::new(
+                T::DATA_TYPE,
+                data.len(),
+                Some(data.null_count()),
+                data.null_buffer().cloned(),
+                data.offset(),
+                data.buffers().to_vec(),
+                vec![],
+            )));
+            let values = make_array(data.child_data()[0].clone());
             Self {
                 data,
-                raw_values: RawPtrBox::new(raw_values as *const T::Native),
+                keys,
                 values,
                 is_ordered: false,
             }
@@ -2396,32 +2309,25 @@ impl<T: ArrowPrimitiveType> Array for DictionaryArray<T> {
         &self.data
     }
 
-    /// Returns the total number of bytes of memory occupied by the buffers owned by this [DictionaryArray].
     fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size() + self.values().get_buffer_memory_size()
+        // Since both `keys` and `values` derive (are references from) `data`, we only need to account for `data`.
+        self.data.get_buffer_memory_size()
     }
 
-    /// Returns the total number of bytes of memory occupied physically by this [DictionaryArray].
     fn get_array_memory_size(&self) -> usize {
         self.data.get_array_memory_size()
-            + self.values().get_array_memory_size()
+            + self.keys.get_array_memory_size()
+            + self.values.get_array_memory_size()
             + mem::size_of_val(self)
     }
 }
 
 impl<T: ArrowPrimitiveType> fmt::Debug for DictionaryArray<T> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        const MAX_LEN: usize = 10;
-        let keys: Vec<_> = self.keys().take(MAX_LEN).collect();
-        let elipsis = if self.keys().count() > MAX_LEN {
-            "..."
-        } else {
-            ""
-        };
         writeln!(
             f,
-            "DictionaryArray {{keys: {:?}{} values: {:?}}}",
-            keys, elipsis, self.values
+            "DictionaryArray {{keys: {:?} values: {:?}}}",
+            self.keys, self.values
         )
     }
 }
@@ -3084,23 +2990,7 @@ mod tests {
         // Null count only makes sense in terms of the component arrays.
         assert_eq!(0, dict_array.null_count());
         assert_eq!(0, dict_array.values().null_count());
-        assert_eq!(Some(Some(3)), dict_array.keys().nth(1));
-        assert_eq!(Some(Some(4)), dict_array.keys().nth(2));
-
-        assert_eq!(
-            dict_array.keys().collect::<Vec<Option<i16>>>(),
-            vec![Some(2), Some(3), Some(4)]
-        );
-
-        assert_eq!(
-            dict_array.keys().rev().collect::<Vec<Option<i16>>>(),
-            vec![Some(4), Some(3), Some(2)]
-        );
-
-        assert_eq!(
-            dict_array.keys().rev().rev().collect::<Vec<Option<i16>>>(),
-            vec![Some(2), Some(3), Some(4)]
-        );
+        assert_eq!(dict_array.keys(), &Int16Array::from(vec![2_i16, 3, 4]));
 
         // Now test with a non-zero offset
         let dict_data = ArrayData::builder(dict_data_type)
@@ -3115,26 +3005,7 @@ mod tests {
         assert_eq!(value_data, values.data());
         assert_eq!(DataType::Int8, dict_array.value_type());
         assert_eq!(2, dict_array.len());
-        assert_eq!(Some(Some(3)), dict_array.keys().nth(0));
-        assert_eq!(Some(Some(4)), dict_array.keys().nth(1));
-
-        assert_eq!(
-            dict_array.keys().collect::<Vec<Option<i16>>>(),
-            vec![Some(3), Some(4)]
-        );
-    }
-
-    #[test]
-    fn test_dictionary_array_key_reverse() {
-        let test = vec!["a", "a", "b", "c"];
-        let array: DictionaryArray<Int8Type> = test
-            .iter()
-            .map(|&x| if x == "b" { None } else { Some(x) })
-            .collect();
-        assert_eq!(
-            array.keys().rev().collect::<Vec<Option<i8>>>(),
-            vec![Some(1), None, Some(0), Some(0)]
-        );
+        assert_eq!(dict_array.keys(), &Int16Array::from(vec![3_i16, 4]));
     }
 
     #[test]
@@ -4239,7 +4110,7 @@ mod tests {
         builder.append(22345678).unwrap();
         let array = builder.finish();
         assert_eq!(
-            "DictionaryArray {keys: [Some(0), None, Some(1)] values: PrimitiveArray\n[\n 12345678,\n 22345678,\n]}\n",
+            "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n null,\n 1,\n] values: PrimitiveArray\n[\n 12345678,\n 22345678,\n]}\n",
             format!("{:?}", array)
         );
@@ -4251,7 +4122,7 @@ mod tests {
         }
         let array = builder.finish();
         assert_eq!(
-            "DictionaryArray {keys: [Some(0), Some(0), Some(0), Some(0), Some(0), Some(0), Some(0), Some(0), Some(0), Some(0)]... values: PrimitiveArray\n[\n 1,\n]}\n",
+            "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n] values: PrimitiveArray\n[\n 1,\n]}\n",
             format!("{:?}", array)
         );
     }
@@ -4264,13 +4135,13 @@ mod tests {
             .map(|&x| if x == "b" { None } else { Some(x) })
             .collect();
         assert_eq!(
-            "DictionaryArray {keys: [Some(0), Some(0), None, Some(1)] values: StringArray\n[\n \"a\",\n \"c\",\n]}\n",
+            "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n 0,\n null,\n 1,\n] values: StringArray\n[\n \"a\",\n \"c\",\n]}\n",
             format!("{:?}", array)
         );
 
         let array: DictionaryArray<Int8Type> = test.into_iter().collect();
         assert_eq!(
-            "DictionaryArray {keys: [Some(0), Some(0), Some(1), Some(2)] values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n",
+            "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n 0,\n 1,\n 2,\n] values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n",
             format!("{:?}", array)
         );
     }
 }
diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs
index 7d1122fba46..04280631823 100644
--- a/rust/arrow/src/array/builder.rs
+++ b/rust/arrow/src/array/builder.rs
@@ -2308,7 +2308,7 @@ where
     ///
     /// ```
     /// use arrow::datatypes::Int16Type;
-    /// use arrow::array::{StringArray, StringDictionaryBuilder, PrimitiveBuilder};
+    /// use arrow::array::{StringArray, StringDictionaryBuilder, PrimitiveBuilder, Int16Array};
     /// use std::convert::TryFrom;
     ///
     /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]);
@@ -2320,9 +2320,9 @@ where
     ///
     /// let dictionary_array = builder.finish();
     ///
-    /// let keys: Vec<Option<i16>> = dictionary_array.keys().collect();
+    /// let keys = dictionary_array.keys();
     ///
-    /// assert_eq!(keys, vec![Some(2), None, Some(1)]);
+    /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)]));
     /// ```
     pub fn new_with_dictionary(
         keys_builder: PrimitiveBuilder<K>,
@@ -3409,8 +3409,10 @@ mod tests {
         builder.append(22345678).unwrap();
         let array = builder.finish();
 
-        // Keys are strongly typed.
-        let aks: Vec<_> = array.keys().collect();
+        assert_eq!(
+            array.keys(),
+            &UInt8Array::from(vec![Some(0), None, Some(1)])
+        );
 
         // Values are polymorphic and so require a downcast.
         let av = array.values();
@@ -3421,7 +3423,6 @@ mod tests {
         assert_eq!(array.is_null(1), true);
         assert_eq!(array.is_null(2), false);
 
-        assert_eq!(aks, vec![Some(0), None, Some(1)]);
         assert_eq!(avs, &[12345678, 22345678]);
     }
 
@@ -3437,14 +3438,15 @@ mod tests {
         builder.append("abc").unwrap();
         let array = builder.finish();
 
-        // Keys are strongly typed.
-        let aks: Vec<_> = array.keys().collect();
+        assert_eq!(
+            array.keys(),
+            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
+        );
 
         // Values are polymorphic and so require a downcast.
         let av = array.values();
         let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
 
-        assert_eq!(aks, vec![Some(0), None, Some(1), Some(1), Some(0)]);
         assert_eq!(ava.value(0), "abc");
         assert_eq!(ava.value(1), "def");
     }
@@ -3465,14 +3467,15 @@ mod tests {
         builder.append("ghi").unwrap();
         let array = builder.finish();
 
-        // Keys are strongly typed.
-        let aks: Vec<_> = array.keys().collect();
+        assert_eq!(
+            array.keys(),
+            &Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)])
+        );
 
         // Values are polymorphic and so require a downcast.
         let av = array.values();
         let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
 
-        assert_eq!(aks, vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)]);
         assert_eq!(ava.is_valid(0), false);
         assert_eq!(ava.value(1), "def");
         assert_eq!(ava.value(2), "abc");
diff --git a/rust/arrow/src/array/equal.rs b/rust/arrow/src/array/equal.rs
index 26b4cd45a7a..9a0b7e6b053 100644
--- a/rust/arrow/src/array/equal.rs
+++ b/rust/arrow/src/array/equal.rs
@@ -250,11 +250,9 @@ impl<T: ArrowPrimitiveType> ArrayEqual for DictionaryArray<T> {
         assert!(other_start_idx + (end_idx - start_idx) <= other.len());
 
         let other = other.as_any().downcast_ref::<DictionaryArray<T>>().unwrap();
 
-        let iter_a = self.keys().take(end_idx).skip(start_idx);
-        let iter_b = other.keys().skip(other_start_idx);
-        // For now, all the values must be the same
-        iter_a.eq(iter_b)
+        self.keys()
+            .range_equals(other.keys(), start_idx, end_idx, other_start_idx)
             && self
                 .values()
                 .range_equals(&*other.values(), 0, other.values().len(), 0)
@@ -910,13 +908,8 @@ impl PartialEq> for Va
 impl<T: ArrowPrimitiveType> JsonEqual for DictionaryArray<T> {
     fn equals_json(&self, json: &[&Value]) -> bool {
-        self.keys().zip(json.iter()).all(|aj| match aj {
-            (None, Value::Null) => true,
-            (Some(a), Value::Number(j)) => {
-                a.to_usize().unwrap() as u64 == j.as_u64().unwrap()
-            }
-            _ => false,
-        })
+        // todo: this is wrong: we must test the values also
+        self.keys().equals_json(json)
     }
 }
diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs
index 2e6ed1264a9..976917f5010 100644
--- a/rust/arrow/src/compute/kernels/filter.rs
+++ b/rust/arrow/src/compute/kernels/filter.rs
@@ -1052,10 +1052,7 @@ mod tests {
         // but keys are filtered
         assert_eq!(2, d.len());
         assert_eq!(true, d.is_null(0));
-        assert_eq!(
-            "world",
-            values.value(d.keys().nth(1).unwrap().unwrap() as usize)
-        );
+        assert_eq!("world", values.value(d.keys().value(1) as usize));
     }
 
     #[test]
diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs
index b61e393abf9..cc49e6ca9fc 100644
--- a/rust/arrow/src/compute/kernels/take.rs
+++ b/rust/arrow/src/compute/kernels/take.rs
@@ -881,8 +881,6 @@ mod tests {
         assert_eq!(&expected_values, dict_values);
         assert_eq!(&expected_values, &result_values);
 
-        let result_keys: Int16Array = result.keys().collect::<Vec<Option<i16>>>().into();
-
         let expected_keys = Int16Array::from(vec![
             Some(0),
             Some(0),
@@ -892,9 +890,6 @@ mod tests {
             Some(2),
             None,
         ]);
-
-        assert_eq!(expected_keys.len(), result_keys.len());
-        assert_eq!(expected_keys.data_type(), result_keys.data_type());
-        assert_eq!(expected_keys, result_keys);
+        assert_eq!(result.keys(), &expected_keys);
     }
 }
diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs
index 42ae3506469..e0de9602a16 100644
--- a/rust/arrow/src/json/reader.rs
+++ b/rust/arrow/src/json/reader.rs
@@ -1544,10 +1544,9 @@ mod tests {
         assert_eq!(true, dd.is_valid(2));
         assert_eq!(false, dd.is_valid(11));
 
-        let keys: Vec<_> = dd.keys().collect();
         assert_eq!(
-            keys,
-            vec![
+            dd.keys(),
+            &Int16Array::from(vec![
                 None,
                 Some(0),
                 Some(1),
@@ -1560,7 +1559,7 @@ mod tests {
                 Some(0),
                 Some(0),
                 None
-            ]
+            ])
         );
     }
 
diff --git a/rust/parquet/src/arrow/arrow_writer.rs b/rust/parquet/src/arrow/arrow_writer.rs
index de8bc7c1d46..0de76392a8f 100644
--- a/rust/parquet/src/arrow/arrow_writer.rs
+++ b/rust/parquet/src/arrow/arrow_writer.rs
@@ -177,11 +177,7 @@ fn write_leaves(
             Ok(())
         }
         ArrowDataType::Dictionary(key_type, value_type) => {
-            use arrow_array::{
-                Int16DictionaryArray, Int32DictionaryArray, Int64DictionaryArray,
-                Int8DictionaryArray, PrimitiveArray, StringArray, UInt16DictionaryArray,
-                UInt32DictionaryArray, UInt64DictionaryArray, UInt8DictionaryArray,
-            };
+            use arrow_array::{PrimitiveArray, StringArray};
             use ArrowDataType::*;
             use ColumnWriter::*;
@@ -217,9 +213,10 @@ fn write_leaves(
                         .unwrap();
 
                     use std::convert::TryFrom;
-                    // This removes NULL values from the NullableIter, but
+                    // This removes NULL values from the keys, but
                     // they're encoded by the levels, so that's fine.
                     let materialized_values: Vec<_> = keys
+                        .into_iter()
                         .flatten()
                         .map(|key| {
                             usize::try_from(key).unwrap_or_else(|k| {
@@ -250,14 +247,14 @@ fn write_leaves(
             }
 
             dispatch_dictionary!(
-                Int8, Utf8, ByteArrayColumnWriter => Int8DictionaryArray, StringArray,
-                Int16, Utf8, ByteArrayColumnWriter => Int16DictionaryArray, StringArray,
-                Int32, Utf8, ByteArrayColumnWriter => Int32DictionaryArray, StringArray,
-                Int64, Utf8, ByteArrayColumnWriter => Int64DictionaryArray, StringArray,
-                UInt8, Utf8, ByteArrayColumnWriter => UInt8DictionaryArray, StringArray,
-                UInt16, Utf8, ByteArrayColumnWriter => UInt16DictionaryArray, StringArray,
-                UInt32, Utf8, ByteArrayColumnWriter => UInt32DictionaryArray, StringArray,
-                UInt64, Utf8, ByteArrayColumnWriter => UInt64DictionaryArray, StringArray,
+                Int8, Utf8, ByteArrayColumnWriter => arrow::datatypes::Int8Type, StringArray,
+                Int16, Utf8, ByteArrayColumnWriter => arrow::datatypes::Int16Type, StringArray,
+                Int32, Utf8, ByteArrayColumnWriter => arrow::datatypes::Int32Type, StringArray,
+                Int64, Utf8, ByteArrayColumnWriter => arrow::datatypes::Int64Type, StringArray,
+                UInt8, Utf8, ByteArrayColumnWriter => arrow::datatypes::UInt8Type, StringArray,
+                UInt16, Utf8, ByteArrayColumnWriter => arrow::datatypes::UInt16Type, StringArray,
+                UInt32, Utf8, ByteArrayColumnWriter => arrow::datatypes::UInt32Type, StringArray,
+                UInt64, Utf8, ByteArrayColumnWriter => arrow::datatypes::UInt64Type, StringArray,
             )?;
 
             row_group_writer.close_column(col_writer)?;
@@ -283,48 +280,40 @@ trait Materialize<K, V> {
     fn materialize(&self) -> Vec<Self::Output>;
 }
 
-macro_rules! materialize_string {
-    ($($k:ty,)*) => {
-        $(impl Materialize<$k, arrow_array::StringArray> for dyn Array {
-            type Output = ByteArray;
-
-            fn materialize(&self) -> Vec<Self::Output> {
-                use std::convert::TryFrom;
-
-                let typed_array = self.as_any()
-                    .downcast_ref::<$k>()
-                    .expect("Unable to get dictionary array");
-
-                let keys = typed_array.keys();
-
-                let value_buffer = typed_array.values();
-                let values = value_buffer
-                    .as_any()
-                    .downcast_ref::<arrow_array::StringArray>()
-                    .unwrap();
-
-                // This removes NULL values from the NullableIter, but
-                // they're encoded by the levels, so that's fine.
-                keys
-                    .flatten()
-                    .map(|key| usize::try_from(key).unwrap_or_else(|k| panic!("key {} does not fit in usize", k)))
-                    .map(|key| values.value(key))
-                    .map(ByteArray::from)
-                    .collect()
-            }
-        })*
-    };
-}
+impl<K> Materialize<K, arrow_array::StringArray> for dyn Array
+where
+    K: arrow::datatypes::ArrowDictionaryKeyType,
+{
+    type Output = ByteArray;
+
+    fn materialize(&self) -> Vec<Self::Output> {
+        use arrow::datatypes::ArrowNativeType;
 
-materialize_string! {
-    arrow_array::Int8DictionaryArray,
-    arrow_array::Int16DictionaryArray,
-    arrow_array::Int32DictionaryArray,
-    arrow_array::Int64DictionaryArray,
-    arrow_array::UInt8DictionaryArray,
-    arrow_array::UInt16DictionaryArray,
-    arrow_array::UInt32DictionaryArray,
-    arrow_array::UInt64DictionaryArray,
+        let typed_array = self
+            .as_any()
+            .downcast_ref::<arrow_array::DictionaryArray<K>>()
+            .expect("Unable to get dictionary array");
+
+        let keys = typed_array.keys();
+
+        let value_buffer = typed_array.values();
+        let values = value_buffer
+            .as_any()
+            .downcast_ref::<arrow_array::StringArray>()
+            .unwrap();
+
+        // This removes NULL values from the keys, but
+        // they're encoded by the levels, so that's fine.
+        keys.into_iter()
+            .flatten()
+            .map(|key| {
+                key.to_usize()
+                    .unwrap_or_else(|| panic!("key {:?} does not fit in usize", key))
+            })
+            .map(|key| values.value(key))
+            .map(ByteArray::from)
+            .collect()
+    }
 }
 
 fn write_dict(
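
For reference, a minimal usage sketch of the reworked accessor (illustrative only, not part of the patch): it mirrors the doc examples above and assumes the `arrow` crate at this revision, where `keys()` now returns a zero-copy `&PrimitiveArray<K>` (`Int8Array` being the alias for `PrimitiveArray<Int8Type>`) rather than a `NullableIter`:

```rust
use arrow::array::{Array, DictionaryArray, Int8Array, StringArray};
use arrow::datatypes::Int8Type;

fn main() {
    // "b" is mapped to a null key, as in the doc example above.
    let test = vec!["a", "a", "b", "c"];
    let array: DictionaryArray<Int8Type> = test
        .iter()
        .map(|&x| if x == "b" { None } else { Some(x) })
        .collect();

    // `keys()` is now a plain primitive array view, so the usual Array API applies.
    assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)]));
    assert!(array.keys().is_null(2));

    // Dictionary values are still reached through `values()` plus a downcast.
    let values = array.values();
    let values = values.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!("a", values.value(array.keys().value(0) as usize));
}
```

Because the keys are now an ordinary primitive array, callers can use `value(i)`, `is_null(i)`, equality comparisons, and `into_iter()` directly, which is what the filter, take, JSON reader, and Parquet writer changes above rely on.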