Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 12 additions & 28 deletions rust/arrow/src/array/array_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,9 @@ mod tests {
.build();
let binary_array1 = BinaryArray::from(array_data1);

let array_data2 = ArrayData::builder(DataType::Binary)
let data_type =
DataType::List(Box::new(Field::new("item", DataType::UInt8, false)));
let array_data2 = ArrayData::builder(data_type)
.len(3)
.add_buffer(Buffer::from_slice_ref(&offsets))
.add_child_data(values_data)
Expand Down Expand Up @@ -818,7 +820,9 @@ mod tests {
.build();
let binary_array1 = LargeBinaryArray::from(array_data1);

let array_data2 = ArrayData::builder(DataType::Binary)
let data_type =
DataType::LargeList(Box::new(Field::new("item", DataType::UInt8, false)));
let array_data2 = ArrayData::builder(data_type)
.len(3)
.add_buffer(Buffer::from_slice_ref(&offsets))
.add_child_data(values_data)
Expand Down Expand Up @@ -869,41 +873,21 @@ mod tests {

/// Converting a list array whose child values are `u32` (not `u8`) into a
/// `BinaryArray` is expected to panic with the mismatched-data-types message.
#[test]
#[should_panic(
    expected = "BinaryArray can only be created from List<u8> arrays, mismatched \
                data types."
)]
fn test_binary_array_from_incorrect_list_array_type() {
    // Child values are deliberately u32 rather than u8.
    let raw_values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
    let value_offsets: [i32; 4] = [0, 5, 5, 12];

    let child = ArrayData::builder(DataType::UInt32)
        .len(12)
        .add_buffer(Buffer::from_slice_ref(&raw_values))
        .build();

    // Three list slots described by the four offsets above.
    let list_data = ArrayData::builder(DataType::Utf8)
        .len(3)
        .add_buffer(Buffer::from_slice_ref(&value_offsets))
        .add_child_data(child)
        .build();

    BinaryArray::from(ListArray::from(list_data));
}

#[test]
#[should_panic(
expected = "BinaryArray can only be created from list array of u8 values \
(i.e. List<PrimitiveArray<u8>>)."
expected = "assertion failed: `(left == right)`\n left: `UInt32`,\n \
right: `UInt8`: BinaryArray can only be created from List<u8> arrays, \
mismatched data types."
)]
fn test_binary_array_from_incorrect_list_array() {
let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
let values_data = ArrayData::builder(DataType::UInt32)
.len(12)
.add_buffer(Buffer::from_slice_ref(&values))
.add_child_data(ArrayData::builder(DataType::Boolean).build())
.build();
let offsets: [i32; 4] = [0, 5, 5, 12];

let array_data = ArrayData::builder(DataType::Utf8)
let data_type =
DataType::List(Box::new(Field::new("item", DataType::UInt32, false)));
let array_data = ArrayData::builder(data_type)
.len(3)
.add_buffer(Buffer::from_slice_ref(&offsets))
.add_child_data(values_data)
Expand Down
41 changes: 40 additions & 1 deletion rust/arrow/src/array/array_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,19 @@ use crate::datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType, Field};

/// Trait declaring an offset size, relevant for i32 vs i64 array types.
pub trait OffsetSizeTrait: ArrowNativeType + Num + Ord + std::ops::AddAssign {
/// Reports whether this offset type is the "large" one: the `i32`
/// implementation returns `false` and the `i64` implementation returns `true`.
fn is_large() -> bool;

/// Type-name prefix associated with this offset size: `""` for `i32` and
/// `"Large"` for `i64`.
fn prefix() -> &'static str;

/// Returns this offset value as an `isize`.
fn to_isize(&self) -> isize;
}

impl OffsetSizeTrait for i32 {
#[inline]
fn is_large() -> bool {
false
}

fn prefix() -> &'static str {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we won't need prefix anymore with the new is_large.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might still need it; we also use it for formatting in `Display`.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that we can drop it, yes. We can merge StringOffset, BinaryOffset and OffsetTrait in a single Trait with this, but I wanted to leave it to another PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is one way to remove prefix that does not go as far as @jorgecarleitao suggests to collapse the traits... 8e68e05

""
}
Expand All @@ -47,6 +54,11 @@ impl OffsetSizeTrait for i32 {
}

impl OffsetSizeTrait for i64 {
#[inline]
fn is_large() -> bool {
true
}

fn prefix() -> &'static str {
"Large"
}
Expand Down Expand Up @@ -117,6 +129,21 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
GenericListArrayIter::<'a, OffsetSize>::new(&self)
}

/// Returns the data type of the child field when `data_type` is the list
/// variant that matches `OffsetSize`: `DataType::LargeList` when
/// `OffsetSize::is_large()` is true, `DataType::List` otherwise.
/// Any other combination yields `None`.
#[inline]
fn get_type(data_type: &DataType) -> Option<&DataType> {
    match (OffsetSize::is_large(), data_type) {
        (true, DataType::LargeList(field)) | (false, DataType::List(field)) => {
            Some(field.data_type())
        }
        _ => None,
    }
}

/// Creates a [`GenericListArray`] from an iterator of primitive values
/// # Example
/// ```
Expand Down Expand Up @@ -193,7 +220,19 @@ impl<OffsetSize: OffsetSizeTrait> From<ArrayDataRef> for GenericListArray<Offset
1,
"ListArray should contain a single child array (values array)"
);
let values = make_array(data.child_data()[0].clone());

let values = data.child_data()[0].clone();

if let Some(child) = Self::get_type(data.data_type()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we have a few tests to cover this? checking error message and all.

Also, I'm not sure `assert_eq!` is a good fit here: IMO assertions should only be used to check internal invariants that developers must uphold and that are not exposed to library users, which does not appear to be the case here. It's just a nit, though, since this pattern is already used in multiple places.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with you. However, that requires a larger change, as we would need to move from `From` to `TryFrom`, so for now I just want to avoid unsafe code by panicking every time something may go wrong.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is a mid-way proposal: a31a35a

Basically, use normal Rust error handling, but make the `Into` implementation `expect` the result.

I actually think using asserts / panics directly (as in this PR) is also fine because:

  1. it is an improvement over the current behavior (crash / undefined) to get useful error messages (even if it is in a panic :( )
  2. the use of ArrayData in my mind is also an implementation detail of an Array so most users of Arrow shouldn't be interacting with this code at all.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW I also tried using `TryFrom` directly and, as @jorgecarleitao suspected, there are many kernel implementations that rely on this being infallible.

assert_eq!(values.data_type(), child, "[Large]ListArray's child datatype does not correspond to the List's datatype");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: long line? I remember we enforce a limit of 100 characters.

} else {
panic!(
"[Large]ListArray's datatype must be [Large]ListArray(). It is {:?}",
data.data_type()
);
}

let values = make_array(values);
let value_offsets = data.buffers()[0].as_ptr();

let value_offsets = unsafe { RawPtrBox::<OffsetSize>::new(value_offsets) };
Expand Down
2 changes: 1 addition & 1 deletion rust/arrow/src/array/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -764,7 +764,7 @@ where
values_data.data_type().clone(),
true, // TODO: find a consistent way of getting this
));
let data_type = if OffsetSize::prefix() == "Large" {
let data_type = if OffsetSize::is_large() {
DataType::LargeList(field)
} else {
DataType::List(field)
Expand Down
16 changes: 10 additions & 6 deletions rust/arrow/src/compute/kernels/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,10 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
(_, Struct(_)) => Err(ArrowError::ComputeError(
"Cannot cast to struct from other types".to_string(),
)),
(List(_), List(ref to)) => cast_list_inner::<i32>(&**array, to),
(LargeList(_), LargeList(ref to)) => cast_list_inner::<i64>(&**array, to),
(List(_), List(ref to)) => cast_list_inner::<i32>(&**array, to, to_type),
(LargeList(_), LargeList(ref to)) => {
cast_list_inner::<i64>(&**array, to, to_type)
}
(List(list_from), LargeList(list_to)) => {
if list_to.data_type() != list_from.data_type() {
Err(ArrowError::ComputeError(
Expand All @@ -287,8 +289,8 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
(List(_), _) => Err(ArrowError::ComputeError(
"Cannot cast list to non-list data types".to_string(),
)),
(_, List(ref to)) => cast_primitive_to_list::<i32>(array, to),
(_, LargeList(ref to)) => cast_primitive_to_list::<i64>(array, to),
(_, List(ref to)) => cast_primitive_to_list::<i32>(array, to, to_type),
(_, LargeList(ref to)) => cast_primitive_to_list::<i64>(array, to, to_type),
(Dictionary(index_type, _), _) => match **index_type {
DataType::Int8 => dictionary_cast::<Int8Type>(array, to_type),
DataType::Int16 => dictionary_cast::<Int16Type>(array, to_type),
Expand Down Expand Up @@ -1243,6 +1245,7 @@ where
fn cast_primitive_to_list<OffsetSize: OffsetSizeTrait + NumCast>(
array: &ArrayRef,
to: &Field,
to_type: &DataType,
) -> Result<ArrayRef> {
// cast primitive to list's primitive
let cast_array = cast(array, to.data_type())?;
Expand All @@ -1257,7 +1260,7 @@ fn cast_primitive_to_list<OffsetSize: OffsetSizeTrait + NumCast>(
};

let list_data = ArrayData::new(
to.data_type().clone(),
to_type.clone(),
array.len(),
Some(cast_array.null_count()),
cast_array
Expand All @@ -1279,12 +1282,13 @@ fn cast_primitive_to_list<OffsetSize: OffsetSizeTrait + NumCast>(
fn cast_list_inner<OffsetSize: OffsetSizeTrait>(
array: &dyn Array,
to: &Field,
to_type: &DataType,
) -> Result<ArrayRef> {
let data = array.data_ref();
let underlying_array = make_array(data.child_data()[0].clone());
let cast_array = cast(&underlying_array, to.data_type())?;
let array_data = ArrayData::new(
to.data_type().clone(),
to_type.clone(),
array.len(),
Some(cast_array.null_count()),
cast_array
Expand Down
2 changes: 0 additions & 2 deletions rust/arrow/src/ipc/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -984,7 +984,6 @@ mod tests {
"generated_dictionary",
// "generated_duplicate_fieldnames",
"generated_interval",
"generated_large_batch",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nevi-me I don't remember seeing this in the original PR -- was this change intended ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NM I see #9587 now

"generated_nested",
// "generated_nested_large_offsets",
"generated_null_trivial",
Expand Down Expand Up @@ -1048,7 +1047,6 @@ mod tests {
"generated_dictionary",
// "generated_duplicate_fieldnames",
"generated_interval",
"generated_large_batch",
"generated_nested",
// "generated_nested_large_offsets",
"generated_null_trivial",
Expand Down