-
Notifications
You must be signed in to change notification settings - Fork 2.2k
feat: support binary arguments for StringConcat operator #21883
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
5d46932
c539d00
905571c
5629996
6dc7a22
46bfc74
77768b9
5469c45
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -897,3 +897,74 @@ fn test_binary_comparison_string_numeric_coercion() -> Result<()> { | |
| } | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_string_concat_coercion() -> Result<()> { | ||
| // Binary | ||
| test_coercion_binary_rule!( | ||
| DataType::Binary, | ||
| DataType::Binary, | ||
| Operator::StringConcat, | ||
| DataType::Binary | ||
| ); | ||
| test_coercion_binary_rule!( | ||
| DataType::LargeBinary, | ||
| DataType::LargeBinary, | ||
| Operator::StringConcat, | ||
| DataType::LargeBinary | ||
| ); | ||
| test_coercion_binary_rule!( | ||
| DataType::BinaryView, | ||
| DataType::BinaryView, | ||
| Operator::StringConcat, | ||
| DataType::BinaryView | ||
| ); | ||
|
|
||
| // String | ||
| test_coercion_binary_rule!( | ||
| DataType::Utf8, | ||
| DataType::Utf8, | ||
| Operator::StringConcat, | ||
| DataType::Utf8 | ||
| ); | ||
| test_coercion_binary_rule!( | ||
| DataType::LargeUtf8, | ||
| DataType::LargeUtf8, | ||
| Operator::StringConcat, | ||
| DataType::LargeUtf8 | ||
| ); | ||
| test_coercion_binary_rule!( | ||
| DataType::Utf8View, | ||
| DataType::Utf8View, | ||
| Operator::StringConcat, | ||
| DataType::Utf8View | ||
| ); | ||
|
|
||
| // Mixed string-binary | ||
| for dt in [DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View] { | ||
| assert!( | ||
| BinaryTypeCoercer::new(&DataType::Binary, &Operator::StringConcat, &dt,) | ||
| .get_input_types() | ||
| .is_err(), | ||
| "{}", | ||
| dt | ||
| ); | ||
| assert!( | ||
| BinaryTypeCoercer::new(&dt, &Operator::StringConcat, &DataType::Binary,) | ||
| .get_input_types() | ||
| .is_err(), | ||
| "{}", | ||
| dt | ||
| ); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It only loops over string types paired with `DataType::Binary`.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agreed. By the way, this tests only the coercion; the actual concat operation, as Jeffrey spotted for LargeBinary, is exercised via SLT. |
||
| } | ||
|
|
||
| // Mixed string-other | ||
| test_coercion_binary_rule!( | ||
| DataType::Utf8, | ||
| DataType::Timestamp(Second, None), | ||
| Operator::StringConcat, | ||
| DataType::Utf8 | ||
| ); | ||
|
|
||
| Ok(()) | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -18,7 +18,7 @@ | |||||||||||||||||
| //! This module contains computation kernels that are specific to | ||||||||||||||||||
| //! datafusion and not (yet) targeted to port upstream to arrow | ||||||||||||||||||
| use arrow::array::*; | ||||||||||||||||||
| use arrow::buffer::NullBuffer; | ||||||||||||||||||
| use arrow::buffer::{MutableBuffer, NullBuffer}; | ||||||||||||||||||
| use arrow::compute::kernels::bitwise::{ | ||||||||||||||||||
| bitwise_and, bitwise_and_scalar, bitwise_or, bitwise_or_scalar, bitwise_shift_left, | ||||||||||||||||||
| bitwise_shift_left_scalar, bitwise_shift_right, bitwise_shift_right_scalar, | ||||||||||||||||||
|
|
@@ -161,11 +161,11 @@ create_left_integral_dyn_scalar_kernel!( | |||||||||||||||||
| bitwise_shift_left_scalar | ||||||||||||||||||
| ); | ||||||||||||||||||
|
|
||||||||||||||||||
| /// Concatenates two `StringViewArray`s element-wise. | ||||||||||||||||||
| /// Concatenates two `StringViewArray`s element-wise. | ||||||||||||||||||
| /// If either element is `Null`, the result element is also `Null`. | ||||||||||||||||||
| /// | ||||||||||||||||||
| /// # Errors | ||||||||||||||||||
| /// - Returns an error if the input arrays have different lengths. | ||||||||||||||||||
| /// - Returns an error if the input arrays have different lengths. | ||||||||||||||||||
| /// - Returns an error if any concatenated string exceeds `u32::MAX` (≈4 GB) in length. | ||||||||||||||||||
| pub fn concat_elements_utf8view( | ||||||||||||||||||
| left: &StringViewArray, | ||||||||||||||||||
|
|
@@ -204,6 +204,95 @@ pub fn concat_elements_utf8view( | |||||||||||||||||
| Ok(result.finish()) | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| /// Concatenates two `GenericBinaryArray`s element-wise. | ||||||||||||||||||
| /// If either element is `Null`, the result element is also `Null`. | ||||||||||||||||||
| /// | ||||||||||||||||||
| /// # Errors | ||||||||||||||||||
| /// - Returns an error if the input arrays have different lengths. | ||||||||||||||||||
| /// - Panics if any concatenated string exceeds `T::Offset::MAX` in length. | ||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we move it to a `# Panics` section? Also the kernel below.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This went away |
||||||||||||||||||
| pub fn concat_elements_binary_array<T: OffsetSizeTrait>( | ||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can follow what we do for strings and just use datafusion/datafusion/physical-expr/src/expressions/binary.rs Lines 944 to 951 in 22bb4e6
https://docs.rs/arrow/latest/arrow/compute/kernels/concat_elements/fn.concat_element_binary.html
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for pointing that out. I am reinventing the wheel. Changed to these implementations, leaving only view concats. Is there any reason we don't have StringViewArray/BinaryViewArray/GenericByteViewArray in arrow-string? PR if so?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No particular reason I think, just haven't added support for it
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got it. If you don't mind, I will add it |
||||||||||||||||||
| left: &GenericBinaryArray<T>, | ||||||||||||||||||
| right: &GenericBinaryArray<T>, | ||||||||||||||||||
| ) -> std::result::Result<GenericBinaryArray<T>, ArrowError> { | ||||||||||||||||||
| if left.len() != right.len() { | ||||||||||||||||||
| return Err(ArrowError::ComputeError(format!( | ||||||||||||||||||
| "Arrays must have the same length: {} != {}", | ||||||||||||||||||
| left.len(), | ||||||||||||||||||
| right.len() | ||||||||||||||||||
| ))); | ||||||||||||||||||
| } | ||||||||||||||||||
| // data capacity is unknown, so pass zero | ||||||||||||||||||
| let mut result = GenericBinaryBuilder::<T>::with_capacity(left.len(), 0); | ||||||||||||||||||
|
|
||||||||||||||||||
| // Avoid reallocations by writing to a reused buffer (note we could be even | ||||||||||||||||||
| // more efficient by creating the view directly here and avoid the buffer | ||||||||||||||||||
| // but that would be more complex) | ||||||||||||||||||
| let mut buffer = MutableBuffer::new(0); | ||||||||||||||||||
|
|
||||||||||||||||||
| // Pre-compute combined null bitmap, so the per-row NULL check is more | ||||||||||||||||||
| // efficient | ||||||||||||||||||
| let nulls = NullBuffer::union(left.nulls(), right.nulls()); | ||||||||||||||||||
|
|
||||||||||||||||||
| for i in 0..left.len() { | ||||||||||||||||||
| if nulls.as_ref().is_some_and(|n| n.is_null(i)) { | ||||||||||||||||||
| result.append_null(); | ||||||||||||||||||
| } else { | ||||||||||||||||||
| let l = left.value(i); | ||||||||||||||||||
| let r = right.value(i); | ||||||||||||||||||
| buffer.clear(); | ||||||||||||||||||
| buffer.extend_from_slice(l); | ||||||||||||||||||
| buffer.extend_from_slice(r); | ||||||||||||||||||
| // No try-version of append_value because it panics on overflow | ||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: we could improve the comment
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This went away |
||||||||||||||||||
| result.append_value(&buffer); | ||||||||||||||||||
| } | ||||||||||||||||||
| } | ||||||||||||||||||
| Ok(result.finish()) | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| /// Concatenates two `BinaryViewArray`s element-wise. | ||||||||||||||||||
| /// If either element is `Null`, the result element is also `Null`. | ||||||||||||||||||
| /// | ||||||||||||||||||
| /// # Errors | ||||||||||||||||||
| /// - Returns an error if the input arrays have different lengths. | ||||||||||||||||||
| /// - Returns an error if any concatenated value exceeds `u32::MAX` in length. | ||||||||||||||||||
| pub fn concat_elements_binary_view_array( | ||||||||||||||||||
| left: &BinaryViewArray, | ||||||||||||||||||
| right: &BinaryViewArray, | ||||||||||||||||||
| ) -> std::result::Result<BinaryViewArray, ArrowError> { | ||||||||||||||||||
| if left.len() != right.len() { | ||||||||||||||||||
| return Err(ArrowError::ComputeError(format!( | ||||||||||||||||||
| "Arrays must have the same length: {} != {}", | ||||||||||||||||||
| left.len(), | ||||||||||||||||||
| right.len() | ||||||||||||||||||
| ))); | ||||||||||||||||||
| } | ||||||||||||||||||
| let mut result = BinaryViewBuilder::with_capacity(left.len()); | ||||||||||||||||||
|
|
||||||||||||||||||
| // Avoid reallocations by writing to a reused buffer (note we could be even | ||||||||||||||||||
| // more efficient by creating the view directly here and avoid the buffer | ||||||||||||||||||
| // but that would be more complex) | ||||||||||||||||||
| let mut buffer = MutableBuffer::new(0); | ||||||||||||||||||
|
|
||||||||||||||||||
| // Pre-compute combined null bitmap, so the per-row NULL check is more | ||||||||||||||||||
| // efficient | ||||||||||||||||||
| let nulls = NullBuffer::union(left.nulls(), right.nulls()); | ||||||||||||||||||
|
|
||||||||||||||||||
| for i in 0..left.len() { | ||||||||||||||||||
| if nulls.as_ref().is_some_and(|n| n.is_null(i)) { | ||||||||||||||||||
| result.append_null(); | ||||||||||||||||||
| } else { | ||||||||||||||||||
| let l = left.value(i); | ||||||||||||||||||
| let r = right.value(i); | ||||||||||||||||||
| buffer.clear(); | ||||||||||||||||||
| buffer.extend_from_slice(l); | ||||||||||||||||||
| buffer.extend_from_slice(r); | ||||||||||||||||||
| // Use the fallible `try_append_value` so a failure is propagated to the caller via `?` | ||||||||||||||||||
| result.try_append_value(&buffer)?; | ||||||||||||||||||
| } | ||||||||||||||||||
| } | ||||||||||||||||||
| Ok(result.finish()) | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| /// Invoke a compute kernel on a pair of binary data arrays with flags | ||||||||||||||||||
| macro_rules! regexp_is_match_flag { | ||||||||||||||||||
| ($LEFT:expr, $RIGHT:expr, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{ | ||||||||||||||||||
|
|
||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -321,3 +321,20 @@ query T | |
| SELECT split_part(CAST(binary AS VARCHAR), 'o', 2) FROM t WHERE binary = X'466f6f'; | ||
| ---- | ||
| (empty) | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we add a test with fixedsizebinary too?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, added support for this type as well |
||
| # Pipe concatenation of binaries always provides a binary | ||
| query ? | ||
| SELECT x'636166c3a9' || x'68656c6c6f'; | ||
| ---- | ||
| 636166c3a968656c6c6f | ||
|
|
||
| # Byte pipe operator is forbidden for mixed binary and text | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about mixed binary? 1. query failed: DataFusion error: Error during planning: Cannot infer common string type for string concat operation Binary || LargeBinary
[SQL] SELECT x'636166c3a9' || arrow_cast(x'68656c6c6f', 'LargeBinary');
at /Users/jeffrey/Code/datafusion/datafusion/sqllogictest/test_files/binary.slt:331
Is this intentional?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, I slipped it, thanks!. Added unit and SLT tests |
||
| query error DataFusion error: Error during planning: Cannot infer common string type for string concat operation Binary || Utf8 | ||
| SELECT x'c3a9' || 'hello'; | ||
|
|
||
| query error DataFusion error: Error during planning: Cannot infer common string type for string concat operation Utf8 || LargeBinary | ||
| SELECT 'hello' || arrow_cast(arrow_cast('hello', 'Binary'), 'LargeBinary'); | ||
|
|
||
| query error DataFusion error: Error during planning: Cannot infer common string type for string concat operation Utf8 || BinaryView | ||
| SELECT 'hello' || arrow_cast(arrow_cast('hello', 'Binary'), 'BinaryView'); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -135,3 +135,25 @@ query T | |
| SELECT concat(x'636166c3', x'a968656c6c6f'); | ||
| ---- | ||
| caféhello | ||
|
|
||
| # UDF concatenation for valid UTF-8 arguments | ||
|
Jefffrey marked this conversation as resolved.
Outdated
|
||
| query T | ||
| SELECT concat(x'636166c3a9', x'68656c6c6f'); | ||
| ---- | ||
| caféhello | ||
|
|
||
| # concat UDF allows invalid UTF-8 arguments, so it allows a valid UTF-8 sequence after concatenation | ||
| query T | ||
| SELECT concat(x'c3', x'a9'); | ||
| ---- | ||
| é | ||
|
|
||
| # concat UDF cannot form a valid UTF-8 sequence | ||
| query error Execution error: invalid UTF-8 in binary literal | ||
| SELECT concat(x'ff', x'af'); | ||
|
|
||
| # Mixed binary and text provide actual UTF-8 sequence, not '\xc3a9' as in PostgreSQL | ||
| query T | ||
| SELECT concat(x'c3a9', 'hello'); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| ---- | ||
| éhello | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Before the change,
`Binary || Utf8` returned a `Utf8` string, and `Binary || Binary` also returned `Utf8`. Now the first errors out, and the second returns `Binary`; wouldn't this be a breaking change for users who are hitting this in a query?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is worth noting, as we decided to ban mixing according to this analysis.
I'll add the `api change` label and put an upgrade notice shortly.
I intend to put up a separate PR to harmonise the `concat` UDF behaviour (it allows mixing) with the pipe operator's behaviour to avoid confusion.