Skip to content

Commit

Permalink
char_length() ascii fast path
Browse files Browse the repository at this point in the history
  • Loading branch information
2010YOUY01 committed Sep 6, 2024
1 parent c29603b commit eae8d7e
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 6 deletions.
11 changes: 11 additions & 0 deletions datafusion/functions/src/string/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -351,18 +351,29 @@ pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
///
/// This iterator returns `Option<&str>` for each item in the array.
fn iter(&self) -> ArrayIter<Self>;

/// Check if the array is ASCII only.
fn is_ascii(&self) -> bool;
}

/// Implements [`StringArrayType`] for references to `GenericStringArray<T>`
/// by forwarding each method to the array type's own method of the same name.
impl<'a, T: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<T> {
// The fully qualified `GenericStringArray::<T>::iter` path names the array
// type's own `iter`; this trait is implemented for the reference type, so
// the path cannot resolve back to this method.
fn iter(&self) -> ArrayIter<Self> {
GenericStringArray::<T>::iter(self)
}

// Forwards the ASCII-only check to `GenericStringArray::<T>::is_ascii`.
fn is_ascii(&self) -> bool {
GenericStringArray::<T>::is_ascii(self)
}
}

/// Implements [`StringArrayType`] for references to `StringViewArray` by
/// forwarding each method to the array type's own method of the same name.
impl<'a> StringArrayType<'a> for &'a StringViewArray {
// The fully qualified `StringViewArray::iter` path names the array type's
// own `iter`; this trait is implemented for the reference type, so the path
// cannot resolve back to this method.
fn iter(&self) -> ArrayIter<Self> {
StringViewArray::iter(self)
}

// Forwards the ASCII-only check to `StringViewArray::is_ascii`.
fn is_ascii(&self) -> bool {
StringViewArray::is_ascii(self)
}
}

/// Optimized version of the StringBuilder in Arrow that:
Expand Down
24 changes: 18 additions & 6 deletions datafusion/functions/src/unicode/character_length.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
// specific language governing permissions and limitations
// under the License.

use crate::string::common::StringArrayType;
use crate::utils::{make_scalar_function, utf8_to_int_type};
use arrow::array::{
Array, ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray,
OffsetSizeTrait, PrimitiveArray,
Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveArray,
};
use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
use datafusion_common::Result;
Expand Down Expand Up @@ -99,18 +99,30 @@ fn character_length(args: &[ArrayRef]) -> Result<ArrayRef> {
}
}

fn character_length_general<'a, T: ArrowPrimitiveType, V: ArrayAccessor<Item = &'a str>>(
fn character_length_general<'a, T: ArrowPrimitiveType, V: StringArrayType<'a>>(
array: V,
) -> Result<ArrayRef>
where
T::Native: OffsetSizeTrait,
{
let iter = ArrayIter::new(array);
// String characters are variable-length encoded in UTF-8, so counting the
// number of chars requires expensive decoding. Checking whether the
// strings are ASCII-only, however, is relatively cheap.
// If the strings are ASCII-only, count bytes instead.
let is_array_ascii_only = array.is_ascii();
let iter = array.iter();
let result = iter
.map(|string| {
string.map(|string: &str| {
T::Native::from_usize(string.chars().count())
.expect("should not fail as string.chars will always return integer")
if is_array_ascii_only {
T::Native::from_usize(string.len()).expect(
"should not fail as string.len will always return integer",
)
} else {
T::Native::from_usize(string.chars().count()).expect(
"should not fail as string.chars will always return integer",
)
}
})
})
.collect::<PrimitiveArray<T>>();
Expand Down

0 comments on commit eae8d7e

Please sign in to comment.