From 7f0f243341eaf0d62c612bc412baa3d7c6610550 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 13 Jan 2026 10:32:19 -0500 Subject: [PATCH 1/3] fix missing utf8 check for conversion from BinaryViewArray to StringViewArray --- arrow-array/src/array/byte_view_array.rs | 54 +++++++++++++++++++++++- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index ca8ddfbe2ad5..6f51a2533246 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -988,7 +988,13 @@ impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> { fn from(data: ArrayData) -> Self { - let (_data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts(); + let (data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts(); + assert_eq!( + data_type, + T::DATA_TYPE, + "Mismatched data type, expected {}, got {data_type}", + T::DATA_TYPE + ); let views = buffers.remove(0); // need to maintain order of remaining buffers let buffers = Arc::from(buffers); let views = ScalarBuffer::new(views, offset, len); @@ -1205,9 +1211,11 @@ mod tests { Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray, }; use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer}; - use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN}; + use arrow_data::{ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN}; + use arrow_schema::DataType; use rand::prelude::StdRng; use rand::{Rng, SeedableRng}; + use std::str::from_utf8; const BLOCK_SIZE: u32 = 8; @@ -1814,4 +1822,46 @@ mod tests { assert_eq!(lengths_iter.next(), None, "Should not have more lengths"); } + + #[should_panic(expected = "Mismatched data type, expected Utf8View, got BinaryView")] + #[test] + fn invalid_casting_from_array_data() { + // Should not be able to cast to StringViewArray due to invalid UTF-8 + let array_data = 
binary_view_array_with_invalid_utf8_data().into_data(); + let _ = StringViewArray::from(array_data); + } + + #[should_panic(expected = "invalid utf-8 sequence")] + #[test] + fn invalid_array_data() { + let (views, buffers, nulls) = binary_view_array_with_invalid_utf8_data().into_parts(); + + // manually try and add invalid array data with Utf8View data type + let mut builder = ArrayDataBuilder::new(DataType::Utf8View) + .add_buffer(views.into_inner()) + .len(3); + for buffer in buffers.iter() { + builder = builder.add_buffer(buffer.clone()) + } + builder = builder.nulls(nulls); + + let data = builder.build().unwrap(); // should fail validation + let _arr = StringViewArray::from(data); + } + + /// Returns a BinaryViewArray with one invalid UTF-8 value + fn binary_view_array_with_invalid_utf8_data() -> BinaryViewArray { + let array = GenericByteViewArray::<BinaryViewType>::from(vec![ + b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8], + &[ + 0xf0, 0x80, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, + ], + b"good", + ]); + assert!(from_utf8(array.value(0)).is_ok()); + assert!(from_utf8(array.value(1)).is_err()); // value 1 is invalid utf8 + assert!(from_utf8(array.value(0)).is_ok()); + array + } } From b22ce483b5fa7c4e22e5a9cc73affbdda7a49d3e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 13 Jan 2026 10:38:11 -0500 Subject: [PATCH 2/3] Avoid DataType::drop --- arrow-array/src/array/byte_view_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 6f51a2533246..fd7699bf04a0 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -999,7 +999,7 @@ impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> { let buffers = Arc::from(buffers); let views = ScalarBuffer::new(views, offset, len); Self { - data_type: T::DATA_TYPE, + data_type, views, buffers, nulls, From 5626fc22b90df8892ba4b58d367a00458b9b08c9 Mon Sep 17 
00:00:00 2001 From: Andrew Lamb Date: Tue, 13 Jan 2026 13:28:44 -0500 Subject: [PATCH 3/3] Update arrow-array/src/array/byte_view_array.rs Co-authored-by: Martin Hilton --- arrow-array/src/array/byte_view_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index fd7699bf04a0..1a451ecd71ae 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -1861,7 +1861,7 @@ mod tests { ]); assert!(from_utf8(array.value(0)).is_ok()); assert!(from_utf8(array.value(1)).is_err()); // value 1 is invalid utf8 - assert!(from_utf8(array.value(0)).is_ok()); + assert!(from_utf8(array.value(2)).is_ok()); array } }