diff --git a/rust/arrow/src/array/array_string.rs b/rust/arrow/src/array/array_string.rs index 5545fce3c45..44ae45beb1f 100644 --- a/rust/arrow/src/array/array_string.rs +++ b/rust/arrow/src/array/array_string.rs @@ -264,6 +264,22 @@ impl From } } +impl From>> + for GenericStringArray +{ + fn from(v: Vec>) -> Self { + GenericStringArray::::from_opt_vec(v) + } +} + +impl From> + for GenericStringArray +{ + fn from(v: Vec<&str>) -> Self { + GenericStringArray::::from_vec(v) + } +} + /// An array where each element is a variable-sized sequence of bytes representing a string /// whose maximum length (in bytes) is represented by a i32. pub type StringArray = GenericStringArray; @@ -284,30 +300,6 @@ impl From for LargeStringArray { } } -impl From> for StringArray { - fn from(v: Vec<&str>) -> Self { - StringArray::from_vec(v) - } -} - -impl From> for LargeStringArray { - fn from(v: Vec<&str>) -> Self { - LargeStringArray::from_vec(v) - } -} - -impl From>> for StringArray { - fn from(v: Vec>) -> Self { - StringArray::from_opt_vec(v) - } -} - -impl From>> for LargeStringArray { - fn from(v: Vec>) -> Self { - LargeStringArray::from_opt_vec(v) - } -} - #[cfg(test)] mod tests { use crate::array::{ListBuilder, StringBuilder}; diff --git a/rust/arrow/src/ffi.rs b/rust/arrow/src/ffi.rs index 1d8d36da6d9..7e64167e4bd 100644 --- a/rust/arrow/src/ffi.rs +++ b/rust/arrow/src/ffi.rs @@ -264,6 +264,16 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { data_type, i ))) } + // Variable-sized binaries: have two buffers. + // LargeUtf8: first buffer is i64, second is in bytes + (DataType::LargeUtf8, 1) => size_of::() * 8, + (DataType::LargeUtf8, 2) => size_of::() * 8, + (DataType::LargeUtf8, _) => { + return Err(ArrowError::CDataInterface(format!( + "The datatype \"{:?}\" expects 3 buffers, but requested {}. Please verify that the C data interface is correctly implemented.", + data_type, i + ))) + } _ => { return Err(ArrowError::CDataInterface(format!( "The datatype \"{:?}\" is still not supported in Rust implementation", @@ -520,10 +530,11 @@ impl ArrowArray { let data_type = &self.data_type()?; Ok(match (data_type, i) { - (DataType::Utf8, 1) => { + (DataType::Utf8, 1) | (DataType::LargeUtf8, 1) => { // the len of the offset buffer (buffer 1) equals length + 1 let bits = bit_width(data_type, i)?; - bit_util::ceil((self.array.length as usize + 1) * bits, 8) + debug_assert_eq!(bits % 8, 0); + (self.array.length as usize + 1) * (bits / 8) } (DataType::Utf8, 2) => { // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) @@ -537,6 +548,18 @@ impl ArrowArray { // get last offset (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize } + (DataType::LargeUtf8, 2) => { + // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) + let len = self.buffer_len(1)?; + // first buffer is the null buffer => add(1) + // we assume that pointer is aligned for `i64`, as LargeUtf8 uses `i64` offsets. + #[allow(clippy::cast_ptr_alignment)] + let offset_buffer = unsafe { + *(self.array.buffers as *mut *const u8).add(1) as *const i64 + }; + // get last offset + (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize + } // buffer len of primitive types _ => { let bits = bit_width(data_type, i)?; @@ -595,7 +618,10 @@ impl ArrowArray { #[cfg(test)] mod tests { use super::*; - use crate::array::{make_array, Array, ArrayData, Int32Array, StringArray}; + use crate::array::{ + make_array, Array, ArrayData, BooleanArray, GenericStringArray, Int32Array, + StringOffsetSizeTrait, + }; use crate::compute::kernels; use std::convert::TryFrom; use std::sync::Arc; @@ -624,10 +650,10 @@ mod tests { } // case with nulls is tested in the docs, through the example on this module. - #[test] - fn test_string() -> Result<()> { + fn test_generic_string() -> Result<()> { // create an array natively - let array = StringArray::from(vec![Some("a"), None, Some("aaa")]); + let array = + GenericStringArray::::from(vec![Some("a"), None, Some("aaa")]); // export it let array = ArrowArray::try_from(array.data().as_ref().clone())?; @@ -638,10 +664,13 @@ mod tests { // perform some operation let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).unwrap(); - let array = array.as_any().downcast_ref::().unwrap(); + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); // verify - let expected = StringArray::from(vec![ + let expected = GenericStringArray::::from(vec![ Some("a"), None, Some("aaa"), @@ -654,4 +683,40 @@ mod tests { // (drop/release) Ok(()) } + + #[test] + fn test_string() -> Result<()> { + test_generic_string::() + } + + #[test] + fn test_large_string() -> Result<()> { + test_generic_string::() + } + + #[test] + fn test_bool() -> Result<()> { + // create an array natively + let array = BooleanArray::from(vec![None, Some(true), Some(false)]); + + // export it + let array = ArrowArray::try_from(array.data().as_ref().clone())?; + + // (simulate consumer) import it + let data = Arc::new(ArrayData::try_from(array)?); + let array = make_array(data); + + // perform some operation + let array = array.as_any().downcast_ref::().unwrap(); + let array = kernels::boolean::not(&array)?; + + // verify + assert_eq!( + array, + BooleanArray::from(vec![None, Some(false), Some(true)]) + ); + + // (drop/release) + Ok(()) + } }