From 6fe61b199c5814b9861dd80a6df2e2e14b758286 Mon Sep 17 00:00:00 2001 From: Tai Le Manh Date: Fri, 6 Sep 2024 09:39:36 +0700 Subject: [PATCH] Implement native support StringView for contains function Signed-off-by: Tai Le Manh --- datafusion/functions/Cargo.toml | 2 +- datafusion/functions/src/regex/mod.rs | 1 + .../functions/src/regex/regexp_common.rs | 121 ++++++++++++++++++ datafusion/functions/src/string/common.rs | 100 +-------------- datafusion/functions/src/string/contains.rs | 20 +-- 5 files changed, 135 insertions(+), 109 deletions(-) create mode 100644 datafusion/functions/src/regex/regexp_common.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index c201cff9d67e5..337379a746704 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -54,7 +54,7 @@ math_expressions = [] # enable regular expressions regex_expressions = ["regex"] # enable string functions -string_expressions = ["regex", "uuid"] +string_expressions = ["uuid"] # enable unicode functions unicode_expressions = ["hashbrown", "unicode-segmentation"] diff --git a/datafusion/functions/src/regex/mod.rs b/datafusion/functions/src/regex/mod.rs index 4afbe6cbbb89c..1ff7f3ddb1b4e 100644 --- a/datafusion/functions/src/regex/mod.rs +++ b/datafusion/functions/src/regex/mod.rs @@ -17,6 +17,7 @@ //! "regex" DataFusion functions +pub mod regexp_common; pub mod regexplike; pub mod regexpmatch; pub mod regexpreplace; diff --git a/datafusion/functions/src/regex/regexp_common.rs b/datafusion/functions/src/regex/regexp_common.rs new file mode 100644 index 0000000000000..582ef639173eb --- /dev/null +++ b/datafusion/functions/src/regex/regexp_common.rs @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Common utilities for implementing regex functions + +use crate::string::common::StringArrayType; + +use arrow::array::{Array, ArrayDataBuilder, BooleanArray}; +use arrow::datatypes::DataType; +use arrow_buffer::{BooleanBufferBuilder, NullBuffer}; +use datafusion_common::DataFusionError; +use regex::Regex; + +use std::collections::HashMap; + +#[cfg(doc)] +use arrow::array::{LargeStringArray, StringArray, StringViewArray}; +/// Perform SQL `array ~ regex_array` operation on +/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`]. +/// +/// If a `regex_array` element is an empty string, the corresponding result value is always true. +/// +/// `flags_array` is an optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] of flags, +/// which allow special search modes, such as case-insensitive and multi-line mode. +/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags) +/// for more information. +/// +/// It is inspired / copied from `regexp_is_match_utf8` [arrow-rs]. 
+///
+/// [arrow-rs]: https://github.com/apache/arrow-rs/blob/8c956a9f9ab26c14072740cce64c2b99cb039b13/arrow-string/src/regexp.rs#L31-L37
+///
+/// # Arguments
+///
+/// * `array` - string values to test.
+/// * `regex_array` - one regular expression per row of `array`.
+/// * `flags_array` - optional per-row regex flags; a non-null flag `f` is
+///   prepended to the row's pattern as `(?f)`, a null flag leaves it unchanged.
+///
+/// # Returns
+///
+/// A [`BooleanArray`] with one entry per row of `array`. A row is null when
+/// either its value or its pattern is null.
+///
+/// # Errors
+///
+/// Returns [`DataFusionError::Execution`] if the input arrays differ in length
+/// or if a pattern fails to compile.
+pub fn regexp_is_match_utf8<'a, S1, S2, S3>(
+    array: &'a S1,
+    regex_array: &'a S2,
+    flags_array: Option<&'a S3>,
+) -> datafusion_common::Result<BooleanArray>
+where
+    &'a S1: StringArrayType<'a>,
+    &'a S2: StringArrayType<'a>,
+    &'a S3: StringArrayType<'a>,
+{
+    if array.len() != regex_array.len() {
+        return Err(DataFusionError::Execution(
+            "Cannot perform comparison operation on arrays of different length"
+                .to_string(),
+        ));
+    }
+
+    // `zip` stops at the shortest input, so a shorter flags array would leave
+    // fewer result bits than the `array.len()` declared to the unchecked build
+    // at the end of this function.
+    if let Some(flags) = flags_array {
+        if flags.len() != array.len() {
+            return Err(DataFusionError::Execution(
+                "Cannot apply regex flags to arrays of different length".to_string(),
+            ));
+        }
+    }
+
+    let nulls = NullBuffer::union(array.nulls(), regex_array.nulls());
+
+    // Cache of compiled expressions keyed by the complete (flag-prefixed) pattern,
+    // so that a pattern repeated across rows is compiled only once.
+    let mut patterns: HashMap<String, Regex> = HashMap::new();
+    let mut result = BooleanBufferBuilder::new(array.len());
+
+    let complete_pattern = match flags_array {
+        Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
+            |(pattern, flag)| {
+                pattern.map(|pattern| match flag {
+                    Some(flag) => format!("(?{flag}){pattern}"),
+                    None => pattern.to_string(),
+                })
+            },
+        )) as Box<dyn Iterator<Item = Option<String>>>,
+        None => Box::new(
+            regex_array
+                .iter()
+                .map(|pattern| pattern.map(|pattern| pattern.to_string())),
+        ),
+    };
+
+    for (value, pattern) in array.iter().zip(complete_pattern) {
+        let is_match = match (value, pattern) {
+            // An empty pattern matches every non-null value.
+            (Some(_), Some(pattern)) if pattern.is_empty() => true,
+            (Some(value), Some(pattern)) => {
+                // Single hash lookup: reuse the cached regex or compile and store it.
+                let re = match patterns.entry(pattern) {
+                    std::collections::hash_map::Entry::Occupied(slot) => slot.into_mut(),
+                    std::collections::hash_map::Entry::Vacant(slot) => {
+                        let re = Regex::new(slot.key()).map_err(|e| {
+                            DataFusionError::Execution(format!(
+                                "Regular expression did not compile: {e:?}"
+                            ))
+                        })?;
+                        slot.insert(re)
+                    }
+                };
+                re.is_match(value)
+            }
+            // Null value or null pattern: the bit is a placeholder, `nulls` masks it.
+            _ => false,
+        };
+        result.append(is_match);
+    }
+
+    // SAFETY: the length checks above guarantee exactly one bit was appended per
+    // row of `array`, so the values buffer holds `array.len()` bits, and `nulls`
+    // is the union of the null buffers of two arrays of that same length.
+    let data = unsafe {
+        ArrayDataBuilder::new(DataType::Boolean)
+            .len(array.len())
+            .buffers(vec![result.into()])
+            .nulls(nulls)
+            .build_unchecked()
+    };
+
+    Ok(BooleanArray::from(data))
+}
diff --git
a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 805e5a7f30b93..c8afd73163c3e 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -17,23 +17,20 @@ //! Common utilities for implementing string functions -use std::collections::HashMap; use std::fmt::{Display, Formatter}; use std::sync::Arc; use arrow::array::{ new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, - BooleanArray, GenericStringArray, GenericStringBuilder, OffsetSizeTrait, StringArray, + GenericStringArray, GenericStringBuilder, OffsetSizeTrait, StringArray, StringBuilder, StringViewArray, }; use arrow::buffer::{Buffer, MutableBuffer, NullBuffer}; use arrow::datatypes::DataType; -use arrow_buffer::BooleanBufferBuilder; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; +use datafusion_common::Result; use datafusion_common::{exec_err, ScalarValue}; -use datafusion_common::{DataFusionError, Result}; use datafusion_expr::ColumnarValue; -use regex::Regex; pub(crate) enum TrimType { Left, @@ -481,96 +478,3 @@ where GenericStringArray::::new_unchecked(offsets, values, nulls) })) } - -#[cfg(doc)] -use arrow::array::LargeStringArray; -/// Perform SQL `array ~ regex_array` operation on -/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`]. -/// -/// If `regex_array` element has an empty value, the corresponding result value is always true. -/// -/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] flag, -/// which allow special search modes, such as case-insensitive and multi-line mode. -/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags) -/// for more information. -/// -/// It is inspired / copied from `regexp_is_match_utf8` [arrow-rs]. 
-/// -/// [arrow-rs]: https://github.com/apache/arrow-rs/blob/8c956a9f9ab26c14072740cce64c2b99cb039b13/arrow-string/src/regexp.rs#L31-L37 -pub fn regexp_is_match<'a, S1, S2, S3>( - array: &'a S1, - regex_array: &'a S2, - flags_array: Option<&'a S3>, -) -> Result -where - &'a S1: StringArrayType<'a>, - &'a S2: StringArrayType<'a>, - &'a S3: StringArrayType<'a>, -{ - if array.len() != regex_array.len() { - return Err(DataFusionError::Execution( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let nulls = NullBuffer::union(array.nulls(), regex_array.nulls()); - - let mut patterns: HashMap = HashMap::new(); - let mut result = BooleanBufferBuilder::new(array.len()); - - let complete_pattern = match flags_array { - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( - |(pattern, flags)| { - pattern.map(|pattern| match flags { - Some(flag) => format!("(?{flag}){pattern}"), - None => pattern.to_string(), - }) - }, - )) as Box>>, - None => Box::new( - regex_array - .iter() - .map(|pattern| pattern.map(|pattern| pattern.to_string())), - ), - }; - - array - .iter() - .zip(complete_pattern) - .map(|(value, pattern)| { - match (value, pattern) { - (Some(_), Some(pattern)) if pattern == *"" => { - result.append(true); - } - (Some(value), Some(pattern)) => { - let existing_pattern = patterns.get(&pattern); - let re = match existing_pattern { - Some(re) => re, - None => { - let re = Regex::new(pattern.as_str()).map_err(|e| { - DataFusionError::Execution(format!( - "Regular expression did not compile: {e:?}" - )) - })?; - patterns.entry(pattern).or_insert(re) - } - }; - result.append(re.is_match(value)); - } - _ => result.append(false), - } - Ok(()) - }) - .collect::, DataFusionError>>()?; - - let data = unsafe { - ArrayDataBuilder::new(DataType::Boolean) - .len(array.len()) - .buffers(vec![result.into()]) - .nulls(nulls) - .build_unchecked() - }; - - Ok(BooleanArray::from(data)) -} diff --git 
a/datafusion/functions/src/string/contains.rs b/datafusion/functions/src/string/contains.rs index 8b80317696e91..bb98fb9816efd 100644 --- a/datafusion/functions/src/string/contains.rs +++ b/datafusion/functions/src/string/contains.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::string::common::regexp_is_match; +use crate::regex::regexp_common::regexp_is_match_utf8; use crate::utils::make_scalar_function; use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray, StringViewArray}; @@ -92,7 +92,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8View, Utf8View) => { let mod_str = args[0].as_string_view(); let match_str = args[1].as_string_view(); - let res = regexp_is_match::< + let res = regexp_is_match_utf8::< StringViewArray, StringViewArray, GenericStringArray, @@ -103,7 +103,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8View, Utf8) => { let mod_str = args[0].as_string_view(); let match_str = args[1].as_string::(); - let res = regexp_is_match::< + let res = regexp_is_match_utf8::< StringViewArray, GenericStringArray, GenericStringArray, @@ -114,7 +114,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8View, LargeUtf8) => { let mod_str = args[0].as_string_view(); let match_str = args[1].as_string::(); - let res = regexp_is_match::< + let res = regexp_is_match_utf8::< StringViewArray, GenericStringArray, GenericStringArray, @@ -125,7 +125,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8, Utf8View) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string_view(); - let res = regexp_is_match::< + let res = regexp_is_match_utf8::< GenericStringArray, StringViewArray, GenericStringArray, @@ -136,7 +136,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8, Utf8) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string::(); - let res = regexp_is_match::< + let res = regexp_is_match_utf8::< GenericStringArray, GenericStringArray, 
GenericStringArray, @@ -147,7 +147,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8, LargeUtf8) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string::(); - let res = regexp_is_match::< + let res = regexp_is_match_utf8::< GenericStringArray, GenericStringArray, GenericStringArray, @@ -158,7 +158,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (LargeUtf8, Utf8View) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string_view(); - let res = regexp_is_match::< + let res = regexp_is_match_utf8::< GenericStringArray, StringViewArray, GenericStringArray, @@ -169,7 +169,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (LargeUtf8, Utf8) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string::(); - let res = regexp_is_match::< + let res = regexp_is_match_utf8::< GenericStringArray, GenericStringArray, GenericStringArray, @@ -180,7 +180,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (LargeUtf8, LargeUtf8) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string::(); - let res = regexp_is_match::< + let res = regexp_is_match_utf8::< GenericStringArray, GenericStringArray, GenericStringArray,