diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index ecf1850ad454..7fdab31e6aac 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -1211,9 +1211,7 @@ mod tests { } #[test] - #[should_panic( - expected = "FixedSizeBinaryArray can only be created from FixedSizeList arrays" - )] + #[should_panic(expected = "Expected Binary but child data had UInt32")] fn test_fixed_size_binary_array_from_incorrect_list_array() { let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; let values_data = ArrayData::builder(DataType::UInt32) @@ -1222,15 +1220,15 @@ mod tests { .build() .unwrap(); - let array_data = unsafe { - ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Binary, false)), - 4, - )) - .len(3) - .add_child_data(values_data) - .build_unchecked() - }; + let array_data = ArrayData::builder(DataType::FixedSizeList( + Box::new(Field::new("item", DataType::Binary, false)), + 4, + )) + .len(3) + .add_child_data(values_data) + .build() + .unwrap(); + let list_array = FixedSizeListArray::from(array_data); drop(FixedSizeBinaryArray::from(list_array)); } @@ -1375,4 +1373,34 @@ mod tests { assert_eq!(2, arr.value_length()); assert_eq!(5, arr.len()) } + + #[test] + fn test_binary_array_all_null() { + let data = vec![None]; + let array = BinaryArray::from(data); + array + .data() + .validate_full() + .expect("All null array has valid array data"); + } + + #[test] + fn test_large_binary_array_all_null() { + let data = vec![None]; + let array = LargeBinaryArray::from(data); + array + .data() + .validate_full() + .expect("All null array has valid array data"); + } + + #[test] + fn fixed_size_binary_array_all_null() { + let data = vec![None] as Vec>; + let array = FixedSizeBinaryArray::try_from_sparse_iter(data.into_iter()).unwrap(); + array + .data() + .validate_full() + .expect("All null array has valid array data"); + } } diff --git a/arrow/src/array/array_boolean.rs 
b/arrow/src/array/array_boolean.rs index 3b1959431bfc..583938456bd4 100644 --- a/arrow/src/array/array_boolean.rs +++ b/arrow/src/array/array_boolean.rs @@ -329,14 +329,11 @@ mod tests { } #[test] - #[should_panic(expected = "BooleanArray data should contain a single buffer only \ - (values buffer)")] + #[should_panic(expected = "Expected 1 buffers in array of type Boolean, got 0")] fn test_boolean_array_invalid_buffer_len() { - let data = unsafe { - ArrayData::builder(DataType::Boolean) - .len(5) - .build_unchecked() - }; - drop(BooleanArray::from(data)); + ArrayData::builder(DataType::Boolean) + .len(5) + .build() + .unwrap(); } } diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs index c684c253aa7b..75d4f3ab3fad 100644 --- a/arrow/src/array/array_dictionary.rs +++ b/arrow/src/array/array_dictionary.rs @@ -412,4 +412,14 @@ mod tests { assert_eq!(1, keys.value(2)); assert_eq!(0, keys.value(5)); } + + #[test] + fn test_dictionary_all_nulls() { + let test = vec![None, None, None]; + let array: DictionaryArray = test.into_iter().collect(); + array + .data() + .validate_full() + .expect("All null array has valid array data"); + } } diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs index 7de357a1dc20..f3321339db7d 100644 --- a/arrow/src/array/array_list.rs +++ b/arrow/src/array/array_list.rs @@ -750,7 +750,7 @@ mod tests { #[test] #[should_panic( - expected = "FixedSizeListArray child array length should be a multiple of 3" + expected = "Values length 8 is less than the length (3) multiplied by the value size (3) for FixedSizeList" )] fn test_fixed_size_list_array_unequal_children() { // Construct a value array @@ -765,13 +765,12 @@ mod tests { Box::new(Field::new("item", DataType::Int32, false)), 3, ); - let list_data = unsafe { - ArrayData::builder(list_data_type) - .len(3) - .add_child_data(value_data) - .build_unchecked() - }; - drop(FixedSizeListArray::from(list_data)); + + 
ArrayData::builder(list_data_type) + .len(3) + .add_child_data(value_data) + .build() + .unwrap(); } #[test] @@ -1037,42 +1036,35 @@ mod tests { } #[test] - #[should_panic( - expected = "ListArray data should contain a single buffer only (value offsets)" - )] + #[should_panic(expected = "Expected 1 buffers in array of type")] fn test_list_array_invalid_buffer_len() { - let value_data = unsafe { - ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build_unchecked() - }; + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = unsafe { - ArrayData::builder(list_data_type) - .len(3) - .add_child_data(value_data) - .build_unchecked() - }; - drop(ListArray::from(list_data)); + + ArrayData::builder(list_data_type) + .len(3) + .add_child_data(value_data) + .build() + .unwrap(); } #[test] - #[should_panic( - expected = "ListArray should contain a single child array (values array)" - )] + #[should_panic(expected = "should contain 1 child data array(s), had 0")] fn test_list_array_invalid_child_array_len() { let value_offsets = Buffer::from_slice_ref(&[0, 2, 5, 7]); let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = unsafe { - ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .build_unchecked() - }; - drop(ListArray::from(list_data)); + ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .build() + .unwrap(); } #[test] @@ -1126,13 +1118,12 @@ mod tests { let list_data_type = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = unsafe { - ArrayData::builder(list_data_type) - .add_buffer(buf2) - .add_child_data(value_data) - .build_unchecked() - }; - 
drop(ListArray::from(list_data)); + + ArrayData::builder(list_data_type) + .add_buffer(buf2) + .add_child_data(value_data) + .build() + .unwrap(); } #[test] diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index ac56b4a97cc7..ddc299870751 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -498,12 +498,7 @@ impl PrimitiveArray { /// Constructs a `PrimitiveArray` from an array data reference. impl From for PrimitiveArray { fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 1, - "PrimitiveArray data should contain a single buffer only (values buffer)" - ); - + // safety: ArrayData was checked for validity on construction let ptr = data.buffers()[0].as_ptr(); Self { data, @@ -947,19 +942,15 @@ mod tests { } #[test] - #[should_panic(expected = "PrimitiveArray data should contain a single buffer only \ - (values buffer)")] + #[should_panic(expected = "Expected 1 buffers in array of type Int32, got 2")] fn test_primitive_array_invalid_buffer_len() { let buffer = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4]); - let data = unsafe { - ArrayData::builder(DataType::Int32) - .add_buffer(buffer.clone()) - .add_buffer(buffer) - .len(5) - .build_unchecked() - }; - - drop(Int32Array::from(data)); + ArrayData::builder(DataType::Int32) + .add_buffer(buffer.clone()) + .add_buffer(buffer) + .len(5) + .build() + .unwrap(); } #[test] diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 0d8dcf1a68e3..82c9f16fe99d 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -554,13 +554,22 @@ mod tests { } #[test] - fn test_string_array_from_string_vec() { - let data = vec!["Foo".to_owned(), "Bar".to_owned(), "Baz".to_owned()]; + fn test_string_array_all_null() { + let data = vec![None]; let array = StringArray::from(data); + array + .data() + .validate_full() + .expect("All null array has valid array data"); + } - 
assert_eq!(array.len(), 3); - assert_eq!(array.value(0), "Foo"); - assert_eq!(array.value(1), "Bar"); - assert_eq!(array.value(2), "Baz"); + #[test] + fn test_large_string_array_all_null() { + let data = vec![None]; + let array = LargeStringArray::from(data); + array + .data() + .validate_full() + .expect("All null array has valid array data"); } } diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index da609b855a67..f78e8e1ce786 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -28,6 +28,7 @@ use crate::{ use half::f16; use std::convert::TryInto; use std::mem; +use std::ops::Range; use std::sync::Arc; use super::equal::equal; @@ -293,7 +294,7 @@ impl ArrayData { Some(null_count) => null_count, }; let null_bitmap = null_bit_buffer.map(Bitmap::from); - Self { + let new_self = Self { data_type, len, null_count, @@ -301,7 +302,12 @@ impl ArrayData { buffers, child_data, null_bitmap, - } + }; + + // Bug discovery mechanism: call validate_full here + new_self.validate_full().unwrap(); + + new_self } /// Create a new ArrayData, validating that the provided buffers @@ -322,7 +328,7 @@ impl ArrayData { buffers: Vec, child_data: Vec, ) -> Result { - // Safety justification: `validate` is (will be) called below + // Safety justification: `validate_full` is called below let new_self = unsafe { Self::new_unchecked( data_type, @@ -335,7 +341,8 @@ impl ArrayData { ) }; - new_self.validate()?; + // As the data is not trusted, do a full validation of its contents + new_self.validate_full()?; Ok(new_self) } @@ -582,7 +589,8 @@ impl ArrayData { /// contents of the buffers (e.g. that all offsets for UTF8 arrays /// are within the bounds of the values buffer). 
/// - /// TODO: add a validate_full that validates the offsets and valid utf8 data + /// See [`validate_full`] to validate fully the offset content + /// and the validity of utf8 data pub fn validate(&self) -> Result<()> { // Need at least this mich space in each buffer let len_plus_offset = self.len + self.offset; @@ -689,19 +697,20 @@ impl ArrayData { Ok(()) } - /// Does a cheap sanity check that the `self.len` values in `buffer` are valid - /// offsets (of type T> into some other buffer of `values_length` bytes long - fn validate_offsets( - &self, - buffer: &Buffer, - values_length: usize, - ) -> Result<()> { + /// Returns a reference to the data in `buffer` as a typed slice + /// (typically `&[i32]` or `&[i64]`) after validating. The + /// returned slice is guaranteed to have at least `self.len + 1` + /// entries + fn typed_offsets<'a, T: ArrowNativeType + num::Num + std::fmt::Display>( + &'a self, + buffer: &'a Buffer, + ) -> Result<&'a [T]> { // Validate that there are the correct number of offsets for this array's length let required_offsets = self.len + self.offset + 1; // An empty list-like array can have 0 offsets if buffer.is_empty() { - return Ok(()); + return Ok(&[]); } if (buffer.len() / std::mem::size_of::()) < required_offsets { @@ -712,7 +721,23 @@ // Justification: buffer size was validated above - let offsets = unsafe { &(buffer.typed_data::()[self.offset..]) }; + Ok(unsafe { + &(buffer.typed_data::()[self.offset..self.offset + self.len + 1]) + }) + } + + /// Does a cheap sanity check that the `self.len` values in `buffer` are valid + /// offsets (of type T) into some other buffer of `values_length` bytes long + fn validate_offsets( + &self, + buffer: &Buffer, + values_length: usize, + ) -> Result<()> { + // Justification: buffer size was validated above + let offsets = self.typed_offsets::(buffer)?; + if offsets.is_empty() { + return Ok(()); + } let first_offset = offsets[0].to_usize().ok_or_else(|| {
ArrowError::InvalidArgumentError(format!( @@ -878,6 +903,250 @@ impl ArrayData { values_data.validate()?; Ok(values_data) } + + /// "expensive" validation that ensures: + /// + /// 1. Null count is correct + /// 2. All offsets are valid + /// 3. All String data is valid UTF-8 + /// 3. All dictionary offsets are valid + /// + /// Does not (yet) check + /// 1. Union type_ids are valid (see https://github.com/apache/arrow-rs/issues/85) + /// Note calls `validate()` internally + pub fn validate_full(&self) -> Result<()> { + // Check all buffer sizes prior to looking at them more deeply in this function + self.validate()?; + + let null_bitmap_buffer = self + .null_bitmap + .as_ref() + .map(|null_bitmap| null_bitmap.buffer_ref()); + + let actual_null_count = count_nulls(null_bitmap_buffer, self.offset, self.len); + if actual_null_count != self.null_count { + return Err(ArrowError::InvalidArgumentError(format!( + "null_count value ({}) doesn't match actual number of nulls in array ({})", + self.null_count, actual_null_count + ))); + } + + match &self.data_type { + DataType::Utf8 => { + self.validate_utf8::()?; + } + DataType::LargeUtf8 => { + self.validate_utf8::()?; + } + DataType::Binary => { + self.validate_offsets_full::(self.buffers[1].len())?; + } + DataType::LargeBinary => { + self.validate_offsets_full::(self.buffers[1].len())?; + } + DataType::List(_) | DataType::Map(_, _) => { + let child = &self.child_data[0]; + self.validate_offsets_full::(child.len + child.offset)?; + } + DataType::LargeList(_) => { + let child = &self.child_data[0]; + self.validate_offsets_full::(child.len + child.offset)?; + } + DataType::Union(_) => { + // Validate Union Array as part of implementing new Union semantics + // See comments in `ArrayData::validate()` + // https://github.com/apache/arrow-rs/issues/85 + } + DataType::Dictionary(key_type, _value_type) => { + let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap(); + let max_value = dictionary_length - 1; + match 
key_type.as_ref() { + DataType::UInt8 => self.check_bounds::(max_value)?, + DataType::UInt16 => self.check_bounds::(max_value)?, + DataType::UInt32 => self.check_bounds::(max_value)?, + DataType::UInt64 => self.check_bounds::(max_value)?, + DataType::Int8 => self.check_bounds::(max_value)?, + DataType::Int16 => self.check_bounds::(max_value)?, + DataType::Int32 => self.check_bounds::(max_value)?, + DataType::Int64 => self.check_bounds::(max_value)?, + _ => unreachable!(), + } + } + _ => { + // No extra validation check required for other types + } + }; + + // validate all children recursively + self.child_data + .iter() + .enumerate() + .try_for_each(|(i, child_data)| { + child_data.validate_full().map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "{} child #{} invalid: {}", + self.data_type, i, e + )) + }) + })?; + + Ok(()) + } + + /// Calls the `validate(item_index, range)` function for each of + /// the ranges specified in the arrow offset buffer of type + /// `T`. Also validates that each offset is smaller than + /// `max_offset` + /// + /// For example, the offset buffer contained `[1, 2, 4]`, this + /// function would call `validate([1,2])`, and `validate([2,4])` + fn validate_each_offset( + &self, + offset_buffer: &Buffer, + offset_limit: usize, + validate: V, + ) -> Result<()> + where + T: ArrowNativeType + std::convert::TryInto + num::Num + std::fmt::Display, + V: Fn(usize, Range) -> Result<()>, + { + // An empty binary-like array can have 0 offsets + if self.len == 0 && offset_buffer.is_empty() { + return Ok(()); + } + + let offsets = self.typed_offsets::(offset_buffer)?; + + offsets + .iter() + .zip(offsets.iter().skip(1)) + .enumerate() + .map(|(i, (&start_offset, &end_offset))| { + let start_offset: usize = start_offset + .try_into() + .map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Offset invariant failure: could not convert start_offset {} to usize in slot {}", + start_offset, i)) + })?; + let end_offset: usize = end_offset + 
.try_into() + .map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Offset invariant failure: Could not convert end_offset {} to usize in slot {}", + end_offset, i+1)) + })?; + + if start_offset > offset_limit { + return Err(ArrowError::InvalidArgumentError(format!( + "Offset invariant failure: offset for slot {} out of bounds: {} > {}", + i, start_offset, offset_limit)) + ); + } + + if end_offset > offset_limit { + return Err(ArrowError::InvalidArgumentError(format!( + "Offset invariant failure: offset for slot {} out of bounds: {} > {}", + i, end_offset, offset_limit)) + ); + } + + // check range actually is low -> high + if start_offset > end_offset { + return Err(ArrowError::InvalidArgumentError(format!( + "Offset invariant failure: non-monotonic offset at slot {}: {} > {}", + i, start_offset, end_offset)) + ); + } + + Ok((i, start_offset..end_offset)) + }) + .try_for_each(|res: Result<(usize, Range)>| { + let (item_index, range) = res?; + validate(item_index, range) + }) + } + + /// Ensures that all strings formed by the offsets in buffers[0] + /// into buffers[1] are valid utf8 sequences + fn validate_utf8(&self) -> Result<()> + where + T: ArrowNativeType + std::convert::TryInto + num::Num + std::fmt::Display, + { + let offset_buffer = &self.buffers[0]; + let values_buffer = &self.buffers[1].as_slice(); + + self.validate_each_offset::( + offset_buffer, + values_buffer.len(), + |string_index, range| { + std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Invalid UTF8 sequence at string index {} ({:?}): {}", + string_index, range, e + )) + })?; + Ok(()) + }, + ) + } + + /// Ensures that all offsets in buffers[0] into buffers[1] are + /// between `0` and `offset_limit` + fn validate_offsets_full(&self, offset_limit: usize) -> Result<()> + where + T: ArrowNativeType + std::convert::TryInto + num::Num + std::fmt::Display, + { + let offset_buffer = &self.buffers[0]; + + self.validate_each_offset::( 
+ offset_buffer, + offset_limit, + |_string_index, _range| { + // No validation applied to each value, but the iteration + // itself applies bounds checking to each range + Ok(()) + }, + ) + } + + /// Validates that each value in self.buffers (typed as T) + /// is within the range [0, max_value], inclusive + fn check_bounds(&self, max_value: i64) -> Result<()> + where + T: ArrowNativeType + std::convert::TryInto + num::Num + std::fmt::Display, + { + let required_len = self.len + self.offset; + let buffer = &self.buffers[0]; + + // This should have been checked as part of `validate()` prior + // to calling `validate_full()` but double check to be sure + assert!(buffer.len() / std::mem::size_of::() >= required_len); + + // Justification: buffer size was validated above + let indexes: &[T] = unsafe { &(buffer.typed_data::()[self.offset..]) }; + + indexes.iter().enumerate().try_for_each(|(i, &dict_index)| { + // Do not check the value is null (value can be arbitrary) + if self.is_null(i) { + return Ok(()); + } + let dict_index: i64 = dict_index.try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "Value at position {} out of bounds: {} (can not convert to i64)", + i, dict_index + )) + })?; + + if dict_index < 0 || dict_index > max_value { + return Err(ArrowError::InvalidArgumentError(format!( + "Value at position {} out of bounds: {} (should be in [0, {}])", + i, dict_index, max_value + ))); + } + Ok(()) + }) + } } /// Return the expected [`DataTypeLayout`] Arrays of this data @@ -899,7 +1168,7 @@ fn layout(data_type: &DataType) -> DataTypeLayout { DataType::UInt16 => DataTypeLayout::new_fixed_width(size_of::()), DataType::UInt32 => DataTypeLayout::new_fixed_width(size_of::()), DataType::UInt64 => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Float16 => unimplemented!(), + DataType::Float16 => DataTypeLayout::new_fixed_width(size_of::()), DataType::Float32 => DataTypeLayout::new_fixed_width(size_of::()), DataType::Float64 => 
DataTypeLayout::new_fixed_width(size_of::()), DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width(size_of::()), @@ -1118,6 +1387,7 @@ mod tests { use crate::array::{ Array, BooleanBuilder, Int32Array, Int32Builder, StringArray, StructBuilder, + UInt64Array, }; use crate::buffer::Buffer; use crate::datatypes::Field; @@ -1635,6 +1905,334 @@ mod tests { .unwrap(); } + /// Test that the array of type `data_type` that has invalid utf8 data errors + fn check_utf8_validation(data_type: DataType) { + // 0x80 is a utf8 continuation sequence and is not a valid utf8 sequence itself + let data_buffer = Buffer::from_slice_ref(&[b'a', b'a', 0x80, 0x00]); + let offsets: Vec = [0, 2, 3] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + + let offsets_buffer = Buffer::from_slice_ref(&offsets); + ArrayData::try_new( + data_type, + 2, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")] + fn test_validate_utf8_content() { + check_utf8_validation::(DataType::Utf8); + } + + #[test] + #[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")] + fn test_validate_large_utf8_content() { + check_utf8_validation::(DataType::LargeUtf8); + } + + /// Test that the array of type `data_type` that has invalid indexes (out of bounds) + fn check_index_out_of_bounds_validation(data_type: DataType) { + let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); + // First two offsets are fine, then 5 is out of bounds + let offsets: Vec = [0, 1, 2, 5, 2] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + + let offsets_buffer = Buffer::from_slice_ref(&offsets); + ArrayData::try_new( + data_type, + 4, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic( + expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4" + )] + fn 
test_validate_utf8_out_of_bounds() { + check_index_out_of_bounds_validation::(DataType::Utf8); + } + + #[test] + #[should_panic( + expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4" + )] + fn test_validate_large_utf8_out_of_bounds() { + check_index_out_of_bounds_validation::(DataType::LargeUtf8); + } + + #[test] + #[should_panic( + expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4" + )] + fn test_validate_binary_out_of_bounds() { + check_index_out_of_bounds_validation::(DataType::Binary); + } + + #[test] + #[should_panic( + expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4" + )] + fn test_validate_large_binary_out_of_bounds() { + check_index_out_of_bounds_validation::(DataType::LargeBinary); + } + + // validate that indexes don't go backwards + fn check_index_backwards_validation(data_type: DataType) { + let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); + // First three offsets are fine, then 1 goes backwards + let offsets: Vec = [0, 1, 2, 2, 1] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + + let offsets_buffer = Buffer::from_slice_ref(&offsets); + ArrayData::try_new( + data_type, + 4, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic( + expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" + )] + fn test_validate_utf8_index_backwards() { + check_index_backwards_validation::(DataType::Utf8); + } + + #[test] + #[should_panic( + expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" + )] + fn test_validate_large_utf8_index_backwards() { + check_index_backwards_validation::(DataType::LargeUtf8); + } + + #[test] + #[should_panic( + expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" + )] + fn test_validate_binary_index_backwards() {
check_index_backwards_validation::(DataType::Binary); + } + + #[test] + #[should_panic( + expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1" + )] + fn test_validate_large_binary_index_backwards() { + check_index_backwards_validation::(DataType::LargeBinary); + } + + #[test] + #[should_panic( + expected = "Value at position 1 out of bounds: 3 (should be in [0, 1])" + )] + fn test_validate_dictionary_index_too_large() { + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + + // 3 is not a valid index into the values (only 0 and 1) + let keys: Int32Array = [Some(1), Some(3)].into_iter().collect(); + + let data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + ArrayData::try_new( + data_type, + 2, + None, + None, + 0, + vec![keys.data().buffers[0].clone()], + vec![values.data().clone()], + ) + .unwrap(); + } + + #[test] + #[should_panic( + expected = "Value at position 1 out of bounds: -1 (should be in [0, 1]" + )] + fn test_validate_dictionary_index_negative() { + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + + // -1 is not a valid index at all! + let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect(); + + let data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + ArrayData::try_new( + data_type, + 2, + None, + None, + 0, + vec![keys.data().buffers[0].clone()], + vec![values.data().clone()], + ) + .unwrap(); + } + + #[test] + #[should_panic( + expected = "Value at position 0 out of bounds: 18446744073709551615 (can not convert to i64)" + )] + fn test_validate_dictionary_index_giant_negative() { + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + + // -1 is not a valid index at all! 
+ let keys: UInt64Array = [Some(u64::MAX), Some(1)].into_iter().collect(); + + let data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + ArrayData::try_new( + data_type, + 2, + None, + None, + 0, + vec![keys.data().buffers[0].clone()], + vec![values.data().clone()], + ) + .unwrap(); + } + + /// Test that the list of type `data_type` generates correct offset out of bounds errors + fn check_list_offsets(data_type: DataType) { + let values: Int32Array = + [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); + + // 5 is an invalid offset into a list of only three values + let offsets: Vec = [0, 2, 5, 4] + .iter() + .map(|&v| T::from_usize(v).unwrap()) + .collect(); + let offsets_buffer = Buffer::from_slice_ref(&offsets); + + ArrayData::try_new( + data_type, + 3, + None, + None, + 0, + vec![offsets_buffer], + vec![values.data().clone()], + ) + .unwrap(); + } + + #[test] + #[should_panic( + expected = "Offset invariant failure: offset for slot 1 out of bounds: 5 > 4" + )] + fn test_validate_list_offsets() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_offsets::(DataType::List(Box::new(field_type))); + } + + #[test] + #[should_panic( + expected = "Offset invariant failure: offset for slot 1 out of bounds: 5 > 4" + )] + fn test_validate_large_list_offsets() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_offsets::(DataType::LargeList(Box::new(field_type))); + } + + /// Test that the list of type `data_type` generates correct errors for negative offsets + #[test] + #[should_panic( + expected = "Offset invariant failure: Could not convert end_offset -1 to usize in slot 2" + )] + fn test_validate_list_negative_offsets() { + let values: Int32Array = + [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); + let field_type = Field::new("f", values.data_type().clone(), true); + let data_type = DataType::List(Box::new(field_type)); + + // -1 is an 
invalid offset any way you look at it + let offsets: Vec = vec![0, 2, -1, 4]; + let offsets_buffer = Buffer::from_slice_ref(&offsets); + + ArrayData::try_new( + data_type, + 3, + None, + None, + 0, + vec![offsets_buffer], + vec![values.data().clone()], + ) + .unwrap(); + } + + #[test] + #[should_panic( + expected = "Value at position 1 out of bounds: -1 (should be in [0, 1])" + )] + /// test that children are validated recursively (aka bugs in child data of struct also are flagged) + fn test_validate_recursive() { + // Form invalid dictionary array + let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); + // -1 is not a valid index + let keys: Int32Array = [Some(1), Some(-1), Some(1)].into_iter().collect(); + + let dict_data_type = DataType::Dictionary( + Box::new(keys.data_type().clone()), + Box::new(values.data_type().clone()), + ); + + // purposely create an invalid child data + let dict_data = unsafe { + ArrayData::new_unchecked( + dict_data_type, + 1, + None, + None, + 0, + vec![keys.data().buffers[0].clone()], + vec![values.data().clone()], + ) + }; + + // Now, try and create a struct with this invalid child data (and expect an error) + let data_type = + DataType::Struct(vec![Field::new("d", dict_data.data_type().clone(), true)]); + + ArrayData::try_new(data_type, 1, None, None, 0, vec![], vec![dict_data]).unwrap(); + } + /// returns a buffer initialized with some constant value for tests fn make_i32_buffer(n: usize) -> Buffer { Buffer::from_slice_ref(&vec![42i32; n]) diff --git a/arrow/src/buffer/immutable.rs b/arrow/src/buffer/immutable.rs index 5ba2548190db..3031b99cdfdb 100644 --- a/arrow/src/buffer/immutable.rs +++ b/arrow/src/buffer/immutable.rs @@ -176,7 +176,8 @@ impl Buffer { // * The pointer is non-null by construction // * alignment asserted below. 
let (prefix, offsets, suffix) = self.as_slice().align_to::(); - assert!(prefix.is_empty() && suffix.is_empty()); + assert!(prefix.is_empty() && suffix.is_empty(), + "Buffer memory is not aligned, prefix and suffix should be empty but were not"); offsets } diff --git a/parquet/src/arrow/arrow_reader.rs b/parquet/src/arrow/arrow_reader.rs index 761c5a6781bf..841e494e34ff 100644 --- a/parquet/src/arrow/arrow_reader.rs +++ b/parquet/src/arrow/arrow_reader.rs @@ -670,6 +670,7 @@ mod tests { } #[test] + #[ignore] fn test_read_maps() { let testdata = arrow::util::test_util::parquet_test_data(); let path = format!("{}/nested_maps.snappy.parquet", testdata);