-
Notifications
You must be signed in to change notification settings - Fork 1.1k
fix missing utf8 check for conversion from BinaryViewArray to StringViewArray #9158
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7f0f243
b22ce48
5626fc2
1132a0d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -988,14 +988,18 @@ impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> | |
|
|
||
| impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> { | ||
| fn from(data: ArrayData) -> Self { | ||
| let (_data_type, len, nulls, offset, buffers, _child_data) = data.into_parts(); | ||
|
|
||
| let (data_type, len, nulls, offset, buffers, _child_data) = data.into_parts(); | ||
| assert_eq!( | ||
| data_type, | ||
| T::DATA_TYPE, | ||
| "Mismatched data type, expected {}, got {data_type}", | ||
| T::DATA_TYPE | ||
| ); | ||
| let mut buffers = buffers.into_iter(); | ||
| // first buffer is views, remaining are data buffers | ||
| let views = ScalarBuffer::new(buffers.next().unwrap(), offset, len); | ||
|
|
||
| Self { | ||
| data_type: T::DATA_TYPE, | ||
| data_type, | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jhorstmann noted that reusing |
||
| views, | ||
| buffers: Arc::from_iter(buffers), | ||
| nulls, | ||
|
|
@@ -1207,9 +1211,11 @@ mod tests { | |
| Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray, | ||
| }; | ||
| use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer}; | ||
| use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN}; | ||
| use arrow_data::{ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN}; | ||
| use arrow_schema::DataType; | ||
| use rand::prelude::StdRng; | ||
| use rand::{Rng, SeedableRng}; | ||
| use std::str::from_utf8; | ||
|
|
||
| const BLOCK_SIZE: u32 = 8; | ||
|
|
||
|
|
@@ -1816,4 +1822,46 @@ mod tests { | |
|
|
||
| assert_eq!(lengths_iter.next(), None, "Should not have more lengths"); | ||
| } | ||
|
|
||
| #[should_panic(expected = "Mismatched data type, expected Utf8View, got BinaryView")] | ||
| #[test] | ||
| fn invalid_casting_from_array_data() { | ||
| // Should not be able to cast to StringViewArray due to invalid UTF-8 | ||
| let array_data = binary_view_array_with_invalid_utf8_data().into_data(); | ||
| let _ = StringViewArray::from(array_data); | ||
| } | ||
|
|
||
| #[should_panic(expected = "invalid utf-8 sequence")] | ||
| #[test] | ||
| fn invalid_array_data() { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test also fails on main but I wanted to make it super clear you can't build an invalid Utf8ViewArray with the ArrayDataBuilder (as expected) |
||
| let (views, buffers, nulls) = binary_view_array_with_invalid_utf8_data().into_parts(); | ||
|
|
||
| // manually try and add invalid array data with Utf8View data type | ||
| let mut builder = ArrayDataBuilder::new(DataType::Utf8View) | ||
| .add_buffer(views.into_inner()) | ||
| .len(3); | ||
| for buffer in buffers.iter() { | ||
| builder = builder.add_buffer(buffer.clone()) | ||
| } | ||
| builder = builder.nulls(nulls); | ||
|
|
||
| let data = builder.build().unwrap(); // should fail validation | ||
| let _arr = StringViewArray::from(data); | ||
| } | ||
|
|
||
| /// Returns a BinaryViewArray with one invalid UTF-8 value | ||
| fn binary_view_array_with_invalid_utf8_data() -> BinaryViewArray { | ||
| let array = GenericByteViewArray::<BinaryViewType>::from(vec![ | ||
| b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8], | ||
| &[ | ||
| 0xf0, 0x80, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
| 0x00, 0x00, | ||
| ], | ||
| b"good", | ||
| ]); | ||
| assert!(from_utf8(array.value(0)).is_ok()); | ||
| assert!(from_utf8(array.value(1)).is_err()); // value 1 is invalid utf8 | ||
| assert!(from_utf8(array.value(2)).is_ok()); | ||
| array | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the equivalent check in GenericByteArray:
arrow-rs/arrow-array/src/array/byte_array.rs
Line 545 in 90839df