Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions arrow/benches/comparison_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,15 @@ fn bench_nlike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
.unwrap();
}

/// Benchmark body: runs `regexp_is_match_utf8_scalar` over `arr_a` with the
/// regular expression `value_b` and no flags, panicking if the kernel errors.
fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) {
    // Route both inputs through `black_box` before handing them to the kernel.
    let haystack = criterion::black_box(arr_a);
    let regex = criterion::black_box(value_b);
    regexp_is_match_utf8_scalar(haystack, regex, None).unwrap();
}

fn add_benchmark(c: &mut Criterion) {
let size = 65536;
let arr_a = create_primitive_array_with_seed::<Float32Type>(size, 0.0, 42);
Expand Down Expand Up @@ -195,6 +204,14 @@ fn add_benchmark(c: &mut Criterion) {
c.bench_function("nlike_utf8 scalar complex", |b| {
b.iter(|| bench_nlike_utf8_scalar(&arr_string, "%xx_xx%xxx"))
});

// Scalar regexp benchmarks: an anchored-prefix pattern and an
// anchored-suffix pattern against the shared string array.
c.bench_function("regexp_matches_utf8 scalar starts with", |b| {
    b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "^xx"))
});

c.bench_function("regexp_matches_utf8 scalar ends with", |b| {
    b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$"))
});
}

criterion_group!(benches, add_benchmark);
Expand Down
244 changes: 244 additions & 0 deletions arrow/src/compute/kernels/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,136 @@ pub fn nlike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
Ok(BooleanArray::from(data))
}

/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`].
/// If `regex_array` element has an empty value, the corresponding result value is always true.
///
/// `flags_array` is an optional [`StringArray`] / [`LargeStringArray`] of flags, which allow
/// special search modes, such as case insensitive and multi-line mode.
/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

/// for more information.
///
/// # Errors
///
/// Returns [`ArrowError::ComputeError`] if `array`, `regex_array` and (when
/// provided) `flags_array` do not all have the same length, or if a pattern
/// (combined with its flags) does not compile as a regular expression.
pub fn regexp_is_match_utf8<OffsetSize: StringOffsetSizeTrait>(
    array: &GenericStringArray<OffsetSize>,
    regex_array: &GenericStringArray<OffsetSize>,
    flags_array: Option<&GenericStringArray<OffsetSize>>,
) -> Result<BooleanArray> {
    if array.len() != regex_array.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform comparison operation on arrays of different length"
                .to_string(),
        ));
    }
    // The patterns and flags are zipped together below, and `zip` stops at the
    // shortest input. A flags array of a different length would therefore
    // yield fewer result bits than the `array.len()` declared for the output,
    // so reject it up front.
    if let Some(flags) = flags_array {
        if flags.len() != array.len() {
            return Err(ArrowError::ComputeError(
                "Cannot perform comparison operation on arrays of different length"
                    .to_string(),
            ));
        }
    }
    let null_bit_buffer =
        combine_option_bitmap(array.data_ref(), regex_array.data_ref(), array.len())?;

    // Compiled regexes, keyed by the complete pattern (flags prefix included),
    // so that a repeated pattern is compiled only once.
    let mut patterns: HashMap<String, Regex> = HashMap::new();
    let mut result = BooleanBufferBuilder::new(array.len());

    // One `Option<String>` per row: `None` when the pattern is null, otherwise
    // the pattern, prefixed with `(?<flags>)` when a non-null flag is present.
    let complete_pattern = match flags_array {
        Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
            |(pattern, flag)| {
                pattern.map(|pattern| match flag {
                    Some(flag) => format!("(?{}){}", flag, pattern),
                    None => pattern.to_string(),
                })
            },
        )) as Box<dyn Iterator<Item = Option<String>>>,
        None => Box::new(
            regex_array
                .iter()
                .map(|pattern| pattern.map(|pattern| pattern.to_string())),
        ),
    };

    for (value, pattern) in array.iter().zip(complete_pattern) {
        match (value, pattern) {
            // Required for Postgres compatibility:
            // SELECT 'foobarbequebaz' ~ ''; = true
            (Some(_), Some(pattern)) if pattern.is_empty() => result.append(true),
            (Some(value), Some(pattern)) => {
                // Borrow the cached regex (compiling it on first use) rather
                // than cloning it for every row.
                let re = match patterns.entry(pattern) {
                    std::collections::hash_map::Entry::Occupied(slot) => {
                        slot.into_mut()
                    }
                    std::collections::hash_map::Entry::Vacant(slot) => {
                        let compiled =
                            Regex::new(slot.key().as_str()).map_err(|e| {
                                ArrowError::ComputeError(format!(
                                    "Regular expression did not compile: {:?}",
                                    e
                                ))
                            })?;
                        slot.insert(compiled)
                    }
                };
                result.append(re.is_match(value));
            }
            // A null value or null pattern: the bit written here is a
            // placeholder; validity comes from `null_bit_buffer`.
            _ => result.append(false),
        }
    }

    let data = ArrayData::new(
        DataType::Boolean,
        array.len(),
        None,
        null_bit_buffer,
        0,
        vec![result.finish()],
        vec![],
    );
    Ok(BooleanArray::from(data))
}

/// Perform SQL `array ~ regex` operation on [`StringArray`] /
/// [`LargeStringArray`] against a single scalar regular expression.
///
/// When `flag` is given, the expression is compiled as `(?<flag>)<regex>`.
/// If the resulting pattern is empty, every result value is `true`.
/// Returns an [`ArrowError::ComputeError`] if the pattern does not compile.
///
/// See the documentation on [`regexp_is_match_utf8`] for more details.
pub fn regexp_is_match_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
    array: &GenericStringArray<OffsetSize>,
    regex: &str,
    flag: Option<&str>,
) -> Result<BooleanArray> {
    let len = array.len();
    // The output reuses the input's validity buffer unchanged.
    // NOTE(review): the buffer is paired with offset 0 below; presumably this
    // assumes `array` itself has a zero offset -- confirm for sliced arrays.
    let null_bit_buffer = array.data().null_buffer().cloned();
    let mut result = BooleanBufferBuilder::new(len);

    let pattern = flag.map_or_else(
        || regex.to_string(),
        |flag| format!("(?{}){}", flag, regex),
    );

    if pattern.is_empty() {
        // An empty pattern matches everything; no regex needs compiling.
        (0..len).for_each(|_| result.append(true));
    } else {
        let re = Regex::new(&pattern).map_err(|e| {
            ArrowError::ComputeError(format!(
                "Regular expression did not compile: {:?}",
                e
            ))
        })?;
        // Every slot is evaluated, null or not; validity is carried solely by
        // `null_bit_buffer`.
        (0..len).for_each(|i| result.append(re.is_match(array.value(i))));
    }

    let data = ArrayData::new(
        DataType::Boolean,
        len,
        None,
        null_bit_buffer,
        0,
        vec![result.finish()],
        vec![],
    );
    Ok(BooleanArray::from(data))
}

/// Perform `left == right` operation on [`StringArray`] / [`LargeStringArray`].
pub fn eq_utf8<OffsetSize: StringOffsetSizeTrait>(
left: &GenericStringArray<OffsetSize>,
Expand Down Expand Up @@ -1438,6 +1568,82 @@ mod tests {
};
}

/// Generates a `#[test]` function named `$test_name` for an array/array kernel
/// that takes an optional flags array.
///
/// `$left` and `$right` are converted into `StringArray`s and passed to `$op`.
/// The five-argument form passes `None` as the flags; the six-argument form
/// builds a `StringArray` from `$flag` and passes it as `Some(..)`.
/// The result must have the same length as `$expected` and match it
/// element by element.
macro_rules! test_flag_utf8 {
    ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
        #[test]
        fn $test_name() {
            let left = StringArray::from($left);
            let right = StringArray::from($right);
            let res = $op(&left, &right, None).unwrap();
            let expected = $expected;
            assert_eq!(expected.len(), res.len());
            for i in 0..res.len() {
                let v = res.value(i);
                // Report the offending row, as `test_flag_utf8_scalar` does.
                assert_eq!(
                    v,
                    expected[i],
                    "unexpected result when comparing {} at position {} to {} ",
                    left.value(i),
                    i,
                    right.value(i)
                );
            }
        }
    };
    ($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr, $expected:expr) => {
        #[test]
        fn $test_name() {
            let left = StringArray::from($left);
            let right = StringArray::from($right);
            let flag = StringArray::from($flag);
            let res = $op(&left, &right, Some(&flag)).unwrap();
            let expected = $expected;
            assert_eq!(expected.len(), res.len());
            for i in 0..res.len() {
                let v = res.value(i);
                // Report the offending row together with the flag in effect.
                assert_eq!(
                    v,
                    expected[i],
                    "unexpected result when comparing {} at position {} to {} with flag {} ",
                    left.value(i),
                    i,
                    right.value(i),
                    flag.value(i)
                );
            }
        }
    };
}

/// Generates a `#[test]` function named `$test_name` for an array/scalar
/// kernel that takes an optional flag.
///
/// `$left` is converted into a `StringArray` and passed to `$op` together with
/// the scalar `$right`. The five-argument form passes `None` as the flag; the
/// six-argument form passes `Some($flag)`. The result must have the same
/// length as `$expected` and match it element by element.
macro_rules! test_flag_utf8_scalar {
    ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
        #[test]
        fn $test_name() {
            let input = StringArray::from($left);
            let actual = $op(&input, $right, None).unwrap();
            let expected = $expected;
            assert_eq!(expected.len(), actual.len());
            // Lengths are equal at this point, so walking `expected` visits
            // every result position.
            for (idx, want) in expected.iter().enumerate() {
                assert_eq!(
                    actual.value(idx),
                    *want,
                    "unexpected result when comparing {} at position {} to {} ",
                    input.value(idx),
                    idx,
                    $right
                );
            }
        }
    };
    ($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr, $expected:expr) => {
        #[test]
        fn $test_name() {
            let input = StringArray::from($left);
            let actual = $op(&input, $right, Some($flag)).unwrap();
            let expected = $expected;
            assert_eq!(expected.len(), actual.len());
            // Lengths are equal at this point, so walking `expected` visits
            // every result position.
            for (idx, want) in expected.iter().enumerate() {
                assert_eq!(
                    actual.value(idx),
                    *want,
                    "unexpected result when comparing {} at position {} to {} ",
                    input.value(idx),
                    idx,
                    $right
                );
            }
        }
    };
}

test_utf8!(
test_utf8_array_like,
vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
Expand Down Expand Up @@ -1621,4 +1827,42 @@ mod tests {
gt_eq_utf8_scalar,
vec![false, false, true, true]
);
// Array/array kernel without flags: each row pairs "arrow" with its own
// pattern. The upper-case patterns are expected not to match, and the final
// empty pattern is expected to match.
test_flag_utf8!(
test_utf8_array_regexp_is_match,
vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"],
vec!["^ar", "^AR", "ow$", "OW$", "foo", ""],
regexp_is_match_utf8,
vec![true, false, true, false, false, true]
);
// Same inputs with the "i" flag on every row: the upper-case patterns are now
// expected to match as well; only "foo" is expected not to match.
test_flag_utf8!(
test_utf8_array_regexp_is_match_insensitive,
vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"],
vec!["^ar", "^AR", "ow$", "OW$", "foo", ""],
vec!["i"; 6],
regexp_is_match_utf8,
vec![true, true, true, true, false, true]
);

// Array/scalar kernel without a flag: only the lower-case "arrow" is expected
// to match "^ar".
test_flag_utf8_scalar!(
test_utf8_array_regexp_is_match_scalar,
vec!["arrow", "ARROW", "parquet", "PARQUET"],
"^ar",
regexp_is_match_utf8_scalar,
vec![true, false, false, false]
);
// An empty scalar pattern is expected to match every value.
test_flag_utf8_scalar!(
test_utf8_array_regexp_is_match_empty_scalar,
vec!["arrow", "ARROW", "parquet", "PARQUET"],
"",
regexp_is_match_utf8_scalar,
vec![true, true, true, true]
);
// With the "i" flag, "^ar" is expected to match both "arrow" and "ARROW".
test_flag_utf8_scalar!(
test_utf8_array_regexp_is_match_insensitive_scalar,
vec!["arrow", "ARROW", "parquet", "PARQUET"],
"^ar",
"i",
regexp_is_match_utf8_scalar,
vec![true, true, false, false]
);
}