diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index 0b90a78577e5..9d2d7761725b 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -68,7 +68,10 @@ jobs: - name: Test arrow-schema run: cargo test -p arrow-schema --all-features - name: Test arrow-array - run: cargo test -p arrow-array --all-features + run: | + cargo test -p arrow-array --all-features + # Disable feature `force_validate` + cargo test -p arrow-array --features=ffi - name: Test arrow-select run: cargo test -p arrow-select --all-features - name: Test arrow-cast diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index f3c34f6ccd13..2ee2fd379ed8 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -408,7 +408,17 @@ impl ImportedArrowArray<'_> { .map(|index| { let len = self.buffer_len(index, variadic_buffer_lens, &self.data_type)?; match unsafe { create_buffer(self.owner.clone(), self.array, index, len) } { - Some(buf) => Ok(buf), + Some(buf) => { + // External libraries may use a dangling pointer for a buffer with length 0. + // We respect the array length specified in the C Data Interface. Actually, + // if the length is incorrect, we cannot create a correct buffer even if + // the pointer is valid. + if buf.is_empty() { + Ok(MutableBuffer::new(0).into()) + } else { + Ok(buf) + } + } None if len == 0 => { // Null data buffer, which Rust doesn't allow. So create // an empty buffer. @@ -1296,9 +1306,15 @@ mod tests_to_then_from_ffi { #[cfg(test)] mod tests_from_ffi { + #[cfg(not(feature = "force_validate"))] + use std::ptr::NonNull; use std::sync::Arc; + #[cfg(feature = "force_validate")] use arrow_buffer::{bit_util, buffer::Buffer}; + #[cfg(not(feature = "force_validate"))] + use arrow_buffer::{bit_util, buffer::Buffer, ScalarBuffer}; + use arrow_data::transform::MutableArrayData; use arrow_data::ArrayData; use arrow_schema::{DataType, Field}; @@ -1660,6 +1676,25 @@ mod tests_from_ffi { } } + #[test] + #[cfg(not(feature = "force_validate"))] + fn test_utf8_view_ffi_from_dangling_pointer() { + let empty = GenericByteViewBuilder::::new().finish(); + let buffers = empty.data_buffers().to_vec(); + let nulls = empty.nulls().cloned(); + + // Create a dangling pointer to a view buffer with zero length. + let alloc = Arc::new(1); + let buffer = unsafe { Buffer::from_custom_allocation(NonNull::::dangling(), 0, alloc) }; + let views = unsafe { ScalarBuffer::new_unchecked(buffer) }; + + let str_view: GenericByteViewArray = + unsafe { GenericByteViewArray::new_unchecked(views, buffers, nulls) }; + let imported = roundtrip_byte_view_array(str_view); + assert_eq!(imported.len(), 0); + assert_eq!(&imported, &empty); + } + #[test] fn test_round_trip_byte_view() { fn test_case() diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 6c66060fb95f..4dd516c708ac 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -72,6 +72,19 @@ impl ScalarBuffer { buffer.slice_with_length(byte_offset, byte_len).into() } + /// Unsafe function to create a new [`ScalarBuffer`] from a [`Buffer`]. + /// Only use for testing purpose. + /// + /// # Safety + /// + /// This function is unsafe because it does not check if the `buffer` is aligned + pub unsafe fn new_unchecked(buffer: Buffer) -> Self { + Self { + buffer, + phantom: Default::default(), + } + } + /// Free up unused memory. pub fn shrink_to_fit(&mut self) { self.buffer.shrink_to_fit(); @@ -99,6 +112,16 @@ impl ScalarBuffer { pub fn ptr_eq(&self, other: &Self) -> bool { self.buffer.ptr_eq(&other.buffer) } + + /// Returns the number of elements in the buffer + pub fn len(&self) -> usize { + self.buffer.len() / std::mem::size_of::() + } + + /// Returns if the buffer is empty + pub fn is_empty(&self) -> bool { + self.len() == 0 + } } impl Deref for ScalarBuffer {