Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 16 additions & 24 deletions rust/arrow/src/array/array_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,22 @@ impl<OffsetSize: StringOffsetSizeTrait> From<ArrayDataRef>
}
}

/// Converts a vector of optional string slices into a [`GenericStringArray`]
/// for any offset size, delegating to `from_opt_vec`.
impl<OffsetSize> From<Vec<Option<&str>>> for GenericStringArray<OffsetSize>
where
    OffsetSize: StringOffsetSizeTrait,
{
    fn from(v: Vec<Option<&str>>) -> Self {
        Self::from_opt_vec(v)
    }
}

/// Converts a vector of string slices into a [`GenericStringArray`] for any
/// offset size, delegating to `from_vec`.
impl<OffsetSize> From<Vec<&str>> for GenericStringArray<OffsetSize>
where
    OffsetSize: StringOffsetSizeTrait,
{
    fn from(v: Vec<&str>) -> Self {
        Self::from_vec(v)
    }
}

/// An array where each element is a variable-sized sequence of bytes representing a string
/// whose maximum length (in bytes) is represented by an `i32`.
///
/// This is [`GenericStringArray`] instantiated with `i32` as its offset size.
pub type StringArray = GenericStringArray<i32>;
Expand All @@ -284,30 +300,6 @@ impl From<LargeListArray> for LargeStringArray {
}
}

impl From<Vec<&str>> for StringArray {
fn from(v: Vec<&str>) -> Self {
StringArray::from_vec(v)
}
}

impl From<Vec<&str>> for LargeStringArray {
fn from(v: Vec<&str>) -> Self {
LargeStringArray::from_vec(v)
}
}

impl From<Vec<Option<&str>>> for StringArray {
fn from(v: Vec<Option<&str>>) -> Self {
StringArray::from_opt_vec(v)
}
}

impl From<Vec<Option<&str>>> for LargeStringArray {
fn from(v: Vec<Option<&str>>) -> Self {
LargeStringArray::from_opt_vec(v)
}
}

#[cfg(test)]
mod tests {
use crate::array::{ListBuilder, StringBuilder};
Expand Down
81 changes: 73 additions & 8 deletions rust/arrow/src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,16 @@ fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
data_type, i
)))
}
// Variable-sized binaries: two buffers (offsets and values) in addition to the validity buffer (buffer 0).
// LargeUtf8: buffer 1 holds `i64` offsets, buffer 2 holds the value bytes
(DataType::LargeUtf8, 1) => size_of::<i64>() * 8,
(DataType::LargeUtf8, 2) => size_of::<u8>() * 8,
(DataType::LargeUtf8, _) => {
return Err(ArrowError::CDataInterface(format!(
"The datatype \"{:?}\" expects 3 buffers, but requested {}. Please verify that the C data interface is correctly implemented.",
data_type, i
)))
}
_ => {
return Err(ArrowError::CDataInterface(format!(
"The datatype \"{:?}\" is still not supported in Rust implementation",
Expand Down Expand Up @@ -520,10 +530,11 @@ impl ArrowArray {
let data_type = &self.data_type()?;

Ok(match (data_type, i) {
(DataType::Utf8, 1) => {
(DataType::Utf8, 1) | (DataType::LargeUtf8, 1) => {
// the len of the offset buffer (buffer 1) equals length + 1
let bits = bit_width(data_type, i)?;
bit_util::ceil((self.array.length as usize + 1) * bits, 8)
debug_assert_eq!(bits % 8, 0);
(self.array.length as usize + 1) * (bits / 8)
}
(DataType::Utf8, 2) => {
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
Expand All @@ -537,6 +548,18 @@ impl ArrowArray {
// get last offset
(unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize
}
(DataType::LargeUtf8, 2) => {
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
let len = self.buffer_len(1)?;
// first buffer is the null buffer => add(1)
// we assume that pointer is aligned for `i64`, as LargeUtf8 uses `i64` offsets.
#[allow(clippy::cast_ptr_alignment)]
let offset_buffer = unsafe {
*(self.array.buffers as *mut *const u8).add(1) as *const i64
};
// get last offset
(unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize
}
// buffer len of primitive types
_ => {
let bits = bit_width(data_type, i)?;
Expand Down Expand Up @@ -595,7 +618,10 @@ impl ArrowArray {
#[cfg(test)]
mod tests {
use super::*;
use crate::array::{make_array, Array, ArrayData, Int32Array, StringArray};
use crate::array::{
make_array, Array, ArrayData, BooleanArray, GenericStringArray, Int32Array,
StringOffsetSizeTrait,
};
use crate::compute::kernels;
use std::convert::TryFrom;
use std::sync::Arc;
Expand Down Expand Up @@ -624,10 +650,10 @@ mod tests {
}
// case with nulls is tested in the docs, through the example on this module.

#[test]
fn test_string() -> Result<()> {
fn test_generic_string<Offset: StringOffsetSizeTrait>() -> Result<()> {
// create an array natively
let array = StringArray::from(vec![Some("a"), None, Some("aaa")]);
let array =
GenericStringArray::<Offset>::from(vec![Some("a"), None, Some("aaa")]);

// export it
let array = ArrowArray::try_from(array.data().as_ref().clone())?;
Expand All @@ -638,10 +664,13 @@ mod tests {

// perform some operation
let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).unwrap();
let array = array.as_any().downcast_ref::<StringArray>().unwrap();
let array = array
.as_any()
.downcast_ref::<GenericStringArray<Offset>>()
.unwrap();

// verify
let expected = StringArray::from(vec![
let expected = GenericStringArray::<Offset>::from(vec![
Some("a"),
None,
Some("aaa"),
Expand All @@ -654,4 +683,40 @@ mod tests {
// (drop/release)
Ok(())
}

/// Runs the shared string round-trip test with `i32` offsets.
#[test]
fn test_string() -> Result<()> {
    type Offset = i32;
    test_generic_string::<Offset>()
}

/// Runs the shared string round-trip test with `i64` offsets.
#[test]
fn test_large_string() -> Result<()> {
    type Offset = i64;
    test_generic_string::<Offset>()
}

#[test]
fn test_bool() -> Result<()> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice and neat :)

// create an array natively
let array = BooleanArray::from(vec![None, Some(true), Some(false)]);

// export it
let array = ArrowArray::try_from(array.data().as_ref().clone())?;

// (simulate consumer) import it
let data = Arc::new(ArrayData::try_from(array)?);
let array = make_array(data);

// perform some operation
let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
let array = kernels::boolean::not(&array)?;

// verify
assert_eq!(
array,
BooleanArray::from(vec![None, Some(false), Some(true)])
);

// (drop/release)
Ok(())
}
}