Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 34 additions & 6 deletions rust/arrow/src/array/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1429,14 +1429,28 @@ impl From<LargeListArray> for LargeBinaryArray {
}
}

/// Like OffsetSizeTrait, but specialized for Strings.
///
/// Each implementing offset type supplies the `DataType` that goes with it, which
/// allows us to expose a constant datatype for the GenericStringArray.
pub trait StringOffsetSizeTrait: OffsetSizeTrait {
/// The `DataType` associated with this offset type.
const DATA_TYPE: DataType;
}

/// `i32` offsets are paired with `DataType::Utf8`.
impl StringOffsetSizeTrait for i32 {
const DATA_TYPE: DataType = DataType::Utf8;
}

/// `i64` offsets are paired with `DataType::LargeUtf8`.
impl StringOffsetSizeTrait for i64 {
const DATA_TYPE: DataType = DataType::LargeUtf8;
}

/// Generic struct for \[Large\]StringArray
pub struct GenericStringArray<OffsetSize> {
pub struct GenericStringArray<OffsetSize: StringOffsetSizeTrait> {
data: ArrayDataRef,
value_offsets: RawPtrBox<OffsetSize>,
value_data: RawPtrBox<u8>,
}

impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
impl<OffsetSize: StringOffsetSizeTrait> GenericStringArray<OffsetSize> {
/// Returns the offset for the element at index `i`.
///
/// Note that this doesn't do any bounds checking, for performance reasons.
Expand Down Expand Up @@ -1559,7 +1573,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
}
}

impl<OffsetSize: OffsetSizeTrait> fmt::Debug for GenericStringArray<OffsetSize> {
impl<OffsetSize: StringOffsetSizeTrait> fmt::Debug for GenericStringArray<OffsetSize> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}StringArray\n[\n", OffsetSize::prefix())?;
print_long_array(self, f, |array, index, f| {
Expand All @@ -1569,7 +1583,7 @@ impl<OffsetSize: OffsetSizeTrait> fmt::Debug for GenericStringArray<OffsetSize>
}
}

impl<OffsetSize: OffsetSizeTrait> Array for GenericStringArray<OffsetSize> {
impl<OffsetSize: StringOffsetSizeTrait> Array for GenericStringArray<OffsetSize> {
fn as_any(&self) -> &Any {
self
}
Expand All @@ -1593,8 +1607,15 @@ impl<OffsetSize: OffsetSizeTrait> Array for GenericStringArray<OffsetSize> {
}
}

impl<OffsetSize: OffsetSizeTrait> From<ArrayDataRef> for GenericStringArray<OffsetSize> {
impl<OffsetSize: StringOffsetSizeTrait> From<ArrayDataRef>
for GenericStringArray<OffsetSize>
{
fn from(data: ArrayDataRef) -> Self {
assert_eq!(
data.data_type(),
&<OffsetSize as StringOffsetSizeTrait>::DATA_TYPE,
"[Large]StringArray expects Datatype::[Large]Utf8"
);
assert_eq!(
data.buffers().len(),
2,
Expand All @@ -1612,7 +1633,7 @@ impl<OffsetSize: OffsetSizeTrait> From<ArrayDataRef> for GenericStringArray<Offs
}
}

impl<OffsetSize: OffsetSizeTrait> ListArrayOps<OffsetSize>
impl<OffsetSize: StringOffsetSizeTrait> ListArrayOps<OffsetSize>
for GenericStringArray<OffsetSize>
{
fn value_offset_at(&self, i: usize) -> OffsetSize {
Expand Down Expand Up @@ -3608,6 +3629,13 @@ mod tests {
}
}

// Checks that converting array data into a `StringArray` panics with the
// expected message when the data comes from a `LargeStringArray`.
#[test]
#[should_panic(expected = "[Large]StringArray expects Datatype::[Large]Utf8")]
fn test_string_array_from_int() {
// Build a large-string array to use as the source of the array data.
let array = LargeStringArray::from(vec!["a", "b"]);
// Expected to panic: presumably the data is typed LargeUtf8 while StringArray
// requires Utf8 (NOTE(review): confirm against the `From<ArrayDataRef>` assertion).
StringArray::from(array.data());
}

#[test]
fn test_large_string_array_from_u8_slice() {
let values: Vec<&str> = vec!["hello", "", "parquet"];
Expand Down
18 changes: 11 additions & 7 deletions rust/arrow/src/array/equal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use crate::datatypes::*;
use crate::util::bit_util;
use array::{
Array, GenericBinaryArray, GenericListArray, GenericStringArray, ListArrayOps,
OffsetSizeTrait,
OffsetSizeTrait, StringOffsetSizeTrait,
};
use hex::FromHex;
use serde_json::value::Value::{Null as JNull, Object, String as JString};
Expand Down Expand Up @@ -141,7 +141,7 @@ impl PartialEq for BooleanArray {
}
}

impl<OffsetSize: OffsetSizeTrait> PartialEq for GenericStringArray<OffsetSize> {
impl<OffsetSize: StringOffsetSizeTrait> PartialEq for GenericStringArray<OffsetSize> {
fn eq(&self, other: &Self) -> bool {
self.equals(other)
}
Expand Down Expand Up @@ -444,7 +444,7 @@ impl<OffsetSize: OffsetSizeTrait> ArrayEqual for GenericBinaryArray<OffsetSize>
}
}

impl<OffsetSize: OffsetSizeTrait> ArrayEqual for GenericStringArray<OffsetSize> {
impl<OffsetSize: StringOffsetSizeTrait> ArrayEqual for GenericStringArray<OffsetSize> {
fn equals(&self, other: &dyn Array) -> bool {
if !base_equal(&self.data(), &other.data()) {
return false;
Expand Down Expand Up @@ -1063,7 +1063,7 @@ impl<OffsetSize: OffsetSizeTrait> PartialEq<GenericBinaryArray<OffsetSize>> for
}
}

impl<OffsetSize: OffsetSizeTrait> JsonEqual for GenericStringArray<OffsetSize> {
impl<OffsetSize: StringOffsetSizeTrait> JsonEqual for GenericStringArray<OffsetSize> {
fn equals_json(&self, json: &[&Value]) -> bool {
if self.len() != json.len() {
return false;
Expand All @@ -1077,7 +1077,9 @@ impl<OffsetSize: OffsetSizeTrait> JsonEqual for GenericStringArray<OffsetSize> {
}
}

impl<OffsetSize: OffsetSizeTrait> PartialEq<Value> for GenericStringArray<OffsetSize> {
impl<OffsetSize: StringOffsetSizeTrait> PartialEq<Value>
for GenericStringArray<OffsetSize>
{
fn eq(&self, json: &Value) -> bool {
match json {
Value::Array(json_array) => self.equals_json_values(&json_array),
Expand All @@ -1086,7 +1088,9 @@ impl<OffsetSize: OffsetSizeTrait> PartialEq<Value> for GenericStringArray<Offset
}
}

impl<OffsetSize: OffsetSizeTrait> PartialEq<GenericStringArray<OffsetSize>> for Value {
impl<OffsetSize: StringOffsetSizeTrait> PartialEq<GenericStringArray<OffsetSize>>
for Value
{
fn eq(&self, arrow: &GenericStringArray<OffsetSize>) -> bool {
match self {
Value::Array(json_array) => arrow.equals_json_values(&json_array),
Expand Down Expand Up @@ -1412,7 +1416,7 @@ mod tests {
// assert!(b_slice.equals(&*a_slice));
}

fn test_generic_string_equal<OffsetSize: OffsetSizeTrait>(datatype: DataType) {
fn test_generic_string_equal<OffsetSize: StringOffsetSizeTrait>(datatype: DataType) {
let a = GenericStringArray::<OffsetSize>::from_vec(
vec!["hello", "world"],
datatype.clone(),
Expand Down
1 change: 1 addition & 0 deletions rust/arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ pub use self::array::GenericListArray;
pub use self::array::GenericStringArray;
pub use self::array::OffsetSizeTrait;
pub use self::array::PrimitiveArrayOps;
pub use self::array::StringOffsetSizeTrait;

// --------------------- Array Builder ---------------------

Expand Down
12 changes: 8 additions & 4 deletions rust/arrow/src/compute/kernels/aggregate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@

use std::ops::Add;

use crate::array::{Array, GenericStringArray, OffsetSizeTrait, PrimitiveArray};
use crate::array::{Array, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait};
use crate::datatypes::ArrowNumericType;

/// Helper function to perform min/max of strings
fn min_max_string<T: OffsetSizeTrait, F: Fn(&str, &str) -> bool>(
fn min_max_string<T: StringOffsetSizeTrait, F: Fn(&str, &str) -> bool>(
array: &GenericStringArray<T>,
cmp: F,
) -> Option<&str> {
Expand Down Expand Up @@ -73,12 +73,16 @@ where
}

/// Returns the maximum value in the string array, according to the natural order.
pub fn max_string<T: OffsetSizeTrait>(array: &GenericStringArray<T>) -> Option<&str> {
pub fn max_string<T: StringOffsetSizeTrait>(
array: &GenericStringArray<T>,
) -> Option<&str> {
min_max_string(array, |a, b| a < b)
}

/// Returns the minimum value in the string array, according to the natural order.
pub fn min_string<T: OffsetSizeTrait>(array: &GenericStringArray<T>) -> Option<&str> {
pub fn min_string<T: StringOffsetSizeTrait>(
array: &GenericStringArray<T>,
) -> Option<&str> {
min_max_string(array, |a, b| a > b)
}

Expand Down
2 changes: 1 addition & 1 deletion rust/arrow/src/compute/kernels/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,7 @@ pub fn contains_utf8<OffsetSize>(
right: &ListArray,
) -> Result<BooleanArray>
where
OffsetSize: OffsetSizeTrait,
OffsetSize: StringOffsetSizeTrait,
{
let left_len = left.len();
if left_len != right.len() {
Expand Down
11 changes: 4 additions & 7 deletions rust/arrow/src/compute/kernels/substring.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,10 @@ use crate::{
};
use std::sync::Arc;

fn substring1<OffsetSize: OffsetSizeTrait>(
fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
start: OffsetSize,
length: &Option<OffsetSize>,
datatype: DataType,
) -> Result<ArrayRef> {
// compute current offsets
let offsets = array.data_ref().clone().buffers()[0].clone();
Expand Down Expand Up @@ -76,7 +75,7 @@ fn substring1<OffsetSize: OffsetSizeTrait>(
});

let data = ArrayData::new(
datatype,
<OffsetSize as StringOffsetSizeTrait>::DATA_TYPE,
array.len(),
None,
null_bit_buffer,
Expand All @@ -95,23 +94,21 @@ fn substring1<OffsetSize: OffsetSizeTrait>(
/// this function errors when the passed array is not a \[Large\]String array.
pub fn substring(array: &Array, start: i64, length: &Option<u64>) -> Result<ArrayRef> {
match array.data_type() {
DataType::LargeUtf8 => substring1(
DataType::LargeUtf8 => generic_substring(
array
.as_any()
.downcast_ref::<LargeStringArray>()
.expect("A large string is expected"),
start,
&length.map(|e| e as i64),
DataType::LargeUtf8,
),
DataType::Utf8 => substring1(
DataType::Utf8 => generic_substring(
array
.as_any()
.downcast_ref::<StringArray>()
.expect("A string is expected"),
start as i32,
&length.map(|e| e as i32),
DataType::Utf8,
),
_ => Err(ArrowError::ComputeError(format!(
"substring does not support type {:?}",
Expand Down
14 changes: 5 additions & 9 deletions rust/arrow/src/compute/kernels/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ pub fn take(
DataType::Duration(TimeUnit::Nanosecond) => {
take_primitive::<DurationNanosecondType>(values, indices)
}
DataType::Utf8 => take_string::<i32>(values, indices, DataType::Utf8),
DataType::LargeUtf8 => take_string::<i64>(values, indices, DataType::LargeUtf8),
DataType::Utf8 => take_string::<i32>(values, indices),
DataType::LargeUtf8 => take_string::<i64>(values, indices),
DataType::List(_) => take_list(values, indices),
DataType::Struct(fields) => {
let struct_: &StructArray =
Expand Down Expand Up @@ -262,13 +262,9 @@ fn take_boolean(values: &ArrayRef, indices: &UInt32Array) -> Result<ArrayRef> {
}

/// `take` implementation for string arrays
fn take_string<OffsetSize>(
values: &ArrayRef,
indices: &UInt32Array,
data_type: DataType,
) -> Result<ArrayRef>
fn take_string<OffsetSize>(values: &ArrayRef, indices: &UInt32Array) -> Result<ArrayRef>
where
OffsetSize: Zero + AddAssign + OffsetSizeTrait,
OffsetSize: Zero + AddAssign + StringOffsetSizeTrait,
{
let data_len = indices.len();

Expand Down Expand Up @@ -306,7 +302,7 @@ where
None => null_buf.freeze(),
};

let data = ArrayData::builder(data_type)
let data = ArrayData::builder(<OffsetSize as StringOffsetSizeTrait>::DATA_TYPE)
.len(data_len)
.null_bit_buffer(nulls)
.add_buffer(Buffer::from(offsets.to_byte_slice()))
Expand Down