Skip to content

Commit 0a0ccb1

Browse files
authored
Feat: Support array flatten() on List(LargeList(_)) types (#18363)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Closes #17670 - Closes #18419 ## Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> ## What changes are included in this PR? Added a `flatten()` `List(LargeList)` test to the `sqllogictest` Added support for array `flatten()` on `List(LargeList(_))` types <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> ## Are these changes tested? `sqllogictest` passes, but I still need to implement a test where offsets could not be downcasted from i64 to i32 <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> ## Are there any user-facing changes? Users will be able to use `flatten` on `List(LargeList)` types <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent 9ea67f5 commit 0a0ccb1

File tree

2 files changed

+35
-14
lines changed

2 files changed

+35
-14
lines changed

datafusion/functions-nested/src/flatten.rs

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ impl ScalarUDFImpl for Flatten {
9696
let data_type = match &arg_types[0] {
9797
List(field) => match field.data_type() {
9898
List(field) | FixedSizeList(field, _) => List(Arc::clone(field)),
99+
LargeList(field) => LargeList(Arc::clone(field)),
99100
_ => arg_types[0].clone(),
100101
},
101102
LargeList(field) => match field.data_type() {
@@ -143,7 +144,8 @@ pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
143144
List(_) => {
144145
let (inner_field, inner_offsets, inner_values, _) =
145146
as_list_array(&values)?.clone().into_parts();
146-
let offsets = get_offsets_for_flatten::<i32>(inner_offsets, offsets);
147+
let offsets =
148+
get_offsets_for_flatten::<i32, i32>(inner_offsets, offsets);
147149
let flattened_array = GenericListArray::<i32>::new(
148150
inner_field,
149151
offsets,
@@ -154,7 +156,17 @@ pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
154156
Ok(Arc::new(flattened_array) as ArrayRef)
155157
}
156158
LargeList(_) => {
157-
exec_err!("flatten does not support type '{:?}'", array.data_type())?
159+
let (inner_field, inner_offsets, inner_values, _) =
160+
as_large_list_array(&values)?.clone().into_parts();
161+
let offsets =
162+
get_offsets_for_flatten::<i64, i32>(inner_offsets, offsets);
163+
let flattened_array = GenericListArray::<i64>::new(
164+
inner_field,
165+
offsets,
166+
inner_values,
167+
nulls,
168+
);
169+
Ok(Arc::new(flattened_array) as ArrayRef)
158170
}
159171
_ => Ok(Arc::clone(array) as ArrayRef),
160172
}
@@ -179,9 +191,10 @@ pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
179191
Ok(Arc::new(flattened_array) as ArrayRef)
180192
}
181193
LargeList(_) => {
182-
let (inner_field, inner_offsets, inner_values, nulls) =
194+
let (inner_field, inner_offsets, inner_values, _) =
183195
as_large_list_array(&values)?.clone().into_parts();
184-
let offsets = get_offsets_for_flatten::<i64>(inner_offsets, offsets);
196+
let offsets =
197+
get_offsets_for_flatten::<i64, i64>(inner_offsets, offsets);
185198
let flattened_array = GenericListArray::<i64>::new(
186199
inner_field,
187200
offsets,
@@ -202,12 +215,12 @@ pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
202215
}
203216

204217
// Create new offsets that are equivalent to `flatten` the array.
205-
fn get_offsets_for_flatten<O: OffsetSizeTrait>(
206-
offsets: OffsetBuffer<O>,
207-
indexes: OffsetBuffer<O>,
218+
fn get_offsets_for_flatten<O: OffsetSizeTrait, P: OffsetSizeTrait>(
219+
inner_offsets: OffsetBuffer<O>,
220+
outer_offsets: OffsetBuffer<P>,
208221
) -> OffsetBuffer<O> {
209-
let buffer = offsets.into_inner();
210-
let offsets: Vec<O> = indexes
222+
let buffer = inner_offsets.into_inner();
223+
let offsets: Vec<O> = outer_offsets
211224
.iter()
212225
.map(|i| buffer[i.to_usize().unwrap()])
213226
.collect();
@@ -216,11 +229,11 @@ fn get_offsets_for_flatten<O: OffsetSizeTrait>(
216229

217230
// Create new large offsets that are equivalent to `flatten` the array.
218231
fn get_large_offsets_for_flatten<O: OffsetSizeTrait, P: OffsetSizeTrait>(
219-
offsets: OffsetBuffer<O>,
220-
indexes: OffsetBuffer<P>,
232+
inner_offsets: OffsetBuffer<O>,
233+
outer_offsets: OffsetBuffer<P>,
221234
) -> OffsetBuffer<i64> {
222-
let buffer = offsets.into_inner();
223-
let offsets: Vec<i64> = indexes
235+
let buffer = inner_offsets.into_inner();
236+
let offsets: Vec<i64> = outer_offsets
224237
.iter()
225238
.map(|i| buffer[i.to_usize().unwrap()].to_i64().unwrap())
226239
.collect();

datafusion/sqllogictest/test_files/array.slt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7757,7 +7757,7 @@ select flatten(make_array(1, 2, 1, 3, 2)),
77577757

77587758
query ???
77597759
select flatten(arrow_cast(make_array(1, 2, 1, 3, 2), 'LargeList(Int64)')),
7760-
flatten(arrow_cast(make_array([1], [2, 3], [null], make_array(4, null, 5)), 'LargeList(LargeList(Int64))')),
7760+
flatten(arrow_cast(make_array([1], null, [2, 3], [null], make_array(4, null, 5)), 'LargeList(LargeList(Int64))')),
77617761
flatten(arrow_cast(make_array([[1.1]], [[2.2]], [[3.3], [4.4]]), 'LargeList(LargeList(LargeList(Float64)))'));
77627762
----
77637763
[1, 2, 1, 3, 2] [1, 2, 3, NULL, 4, NULL, 5] [[1.1], [2.2], [3.3], [4.4]]
@@ -7769,6 +7769,14 @@ select flatten(arrow_cast(make_array(1, 2, 1, 3, 2), 'FixedSizeList(5, Int64)'))
77697769
----
77707770
[1, 2, 1, 3, 2] [1, 2, 3, NULL, 4, NULL, 5] [[1.1], [2.2], [3.3], [4.4]]
77717771

7772+
query ??TT
7773+
select flatten(arrow_cast(make_array([1], [2, 3], [null], make_array(4, null, 5)), 'FixedSizeList(4, LargeList(Int64))')),
7774+
flatten(arrow_cast(make_array([[1.1], [2.2]], [[3.3], [4.4]]), 'List(LargeList(FixedSizeList(1, Float64)))')),
7775+
arrow_typeof(flatten(arrow_cast(make_array([1], [2, 3], [null], make_array(4, null, 5)), 'FixedSizeList(4, LargeList(Int64))'))),
7776+
arrow_typeof(flatten(arrow_cast(make_array([[1.1], [2.2]], [[3.3], [4.4]]), 'List(LargeList(FixedSizeList(1, Float64)))')));
7777+
----
7778+
[1, 2, 3, NULL, 4, NULL, 5] [[1.1], [2.2], [3.3], [4.4]] LargeList(nullable Int64) LargeList(nullable FixedSizeList(1 x nullable Float64))
7779+
77727780
# flatten with column values
77737781
query ????
77747782
select flatten(column1),

0 commit comments

Comments
 (0)