-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Feat: Support array flatten() on List(LargeList(_)) types
#18363
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
e1166de
ac2fac4
0ddf926
85bd164
37ce386
cf6ade3
0057356
59f3fb8
26cdd4e
659b42a
468ca6b
b77497e
fc4cc61
faf1c15
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,10 +20,12 @@ | |
| use crate::utils::make_scalar_function; | ||
| use arrow::array::{Array, ArrayRef, GenericListArray, OffsetSizeTrait}; | ||
| use arrow::buffer::OffsetBuffer; | ||
| use arrow::datatypes::ArrowNativeType; | ||
| use arrow::datatypes::{ | ||
| DataType, | ||
| DataType::{FixedSizeList, LargeList, List, Null}, | ||
| }; | ||
| use arrow::error::ArrowError; | ||
| use datafusion_common::cast::{as_large_list_array, as_list_array}; | ||
| use datafusion_common::{exec_err, utils::take_function_args, Result}; | ||
| use datafusion_expr::{ | ||
|
|
@@ -95,7 +97,9 @@ impl ScalarUDFImpl for Flatten { | |
| fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> { | ||
| let data_type = match &arg_types[0] { | ||
| List(field) => match field.data_type() { | ||
| List(field) | FixedSizeList(field, _) => List(Arc::clone(field)), | ||
| List(field) | LargeList(field) | FixedSizeList(field, _) => { | ||
| List(Arc::clone(field)) | ||
| } | ||
| _ => arg_types[0].clone(), | ||
| }, | ||
| LargeList(field) => match field.data_type() { | ||
|
|
@@ -154,7 +158,32 @@ pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> { | |
| Ok(Arc::new(flattened_array) as ArrayRef) | ||
| } | ||
| LargeList(_) => { | ||
| exec_err!("flatten does not support type '{:?}'", array.data_type())? | ||
| let (inner_field, inner_offsets, inner_values, _) = | ||
| as_large_list_array(&values)?.clone().into_parts(); | ||
| // Try to downcast the inner offsets to i32 | ||
| match downcast_i64_inner_to_i32(&inner_offsets, &offsets) { | ||
| Ok(i32offsets) => { | ||
| let flattened_array = GenericListArray::<i32>::new( | ||
| inner_field, | ||
| i32offsets, | ||
| inner_values, | ||
| nulls, | ||
| ); | ||
| Ok(Arc::new(flattened_array) as ArrayRef) | ||
| } | ||
| // If downcast fails we keep the offsets as is | ||
| Err(_) => { | ||
| // Fallback: keep i64 offsets → LargeList<i64> | ||
| let i64offsets = keep_offsets_i64(inner_offsets, offsets); | ||
| let flattened_array = GenericListArray::<i64>::new( | ||
| inner_field, | ||
| i64offsets, | ||
| inner_values, | ||
| nulls, | ||
| ); | ||
| Ok(Arc::new(flattened_array) as ArrayRef) | ||
|
||
| } | ||
| } | ||
| } | ||
| _ => Ok(Arc::clone(array) as ArrayRef), | ||
| } | ||
|
|
@@ -179,7 +208,7 @@ pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> { | |
| Ok(Arc::new(flattened_array) as ArrayRef) | ||
| } | ||
| LargeList(_) => { | ||
| let (inner_field, inner_offsets, inner_values, nulls) = | ||
| let (inner_field, inner_offsets, inner_values, _) = // _ instead of nulls? | ||
|
||
| as_large_list_array(&values)?.clone().into_parts(); | ||
| let offsets = get_offsets_for_flatten::<i64>(inner_offsets, offsets); | ||
| let flattened_array = GenericListArray::<i64>::new( | ||
|
|
@@ -203,11 +232,11 @@ pub fn flatten_inner(args: &[ArrayRef]) -> Result<ArrayRef> { | |
|
|
||
| // Create new offsets that are equivalent to `flatten` the array. | ||
| fn get_offsets_for_flatten<O: OffsetSizeTrait>( | ||
| offsets: OffsetBuffer<O>, | ||
| indexes: OffsetBuffer<O>, | ||
| inner_offsets: OffsetBuffer<O>, | ||
| outer_offsets: OffsetBuffer<O>, | ||
|
||
| ) -> OffsetBuffer<O> { | ||
| let buffer = offsets.into_inner(); | ||
| let offsets: Vec<O> = indexes | ||
| let buffer = inner_offsets.into_inner(); | ||
| let offsets: Vec<O> = outer_offsets | ||
| .iter() | ||
| .map(|i| buffer[i.to_usize().unwrap()]) | ||
| .collect(); | ||
|
|
@@ -216,17 +245,47 @@ fn get_offsets_for_flatten<O: OffsetSizeTrait>( | |
|
|
||
| // Create new large offsets that are equivalent to `flatten` the array. | ||
| fn get_large_offsets_for_flatten<O: OffsetSizeTrait, P: OffsetSizeTrait>( | ||
| offsets: OffsetBuffer<O>, | ||
| indexes: OffsetBuffer<P>, | ||
| inner_offsets: OffsetBuffer<O>, | ||
| outer_offsets: OffsetBuffer<P>, | ||
| ) -> OffsetBuffer<i64> { | ||
| let buffer = offsets.into_inner(); | ||
| let offsets: Vec<i64> = indexes | ||
| let buffer = inner_offsets.into_inner(); | ||
| let offsets: Vec<i64> = outer_offsets | ||
| .iter() | ||
| .map(|i| buffer[i.to_usize().unwrap()].to_i64().unwrap()) | ||
| .collect(); | ||
| OffsetBuffer::new(offsets.into()) | ||
| } | ||
|
|
||
| // Function for converting LargeList offsets into List offsets | ||
| fn downcast_i64_inner_to_i32( | ||
| inner_offsets: &OffsetBuffer<i64>, | ||
| outer_offsets: &OffsetBuffer<i32>, | ||
| ) -> Result<OffsetBuffer<i32>, ArrowError> { | ||
| let buffer = inner_offsets.clone().into_inner(); | ||
| let offsets: Result<Vec<i32>, _> = outer_offsets | ||
| .iter() | ||
| .map(|i| buffer[i.to_usize().unwrap()]) | ||
| .map(|i| { | ||
| i32::try_from(i) | ||
| .map_err(|_| ArrowError::CastError(format!("Cannot downcast offset {i}"))) | ||
| }) | ||
| .collect(); | ||
| Ok(OffsetBuffer::new(offsets?.into())) | ||
| } | ||
|
||
|
|
||
| // In case the conversion fails we convert the outer offsets into i64 | ||
| fn keep_offsets_i64( | ||
|
||
| inner_offsets: OffsetBuffer<i64>, | ||
| outer_offsets: OffsetBuffer<i32>, | ||
| ) -> OffsetBuffer<i64> { | ||
| let buffer = inner_offsets.into_inner(); | ||
| let offsets: Vec<i64> = outer_offsets | ||
| .iter() | ||
| .map(|i| buffer[i.to_usize().unwrap()]) | ||
| .collect(); | ||
| OffsetBuffer::new(offsets.into()) | ||
| } | ||
|
|
||
| fn cast_fsl_to_list(array: ArrayRef) -> Result<ArrayRef> { | ||
| match array.data_type() { | ||
| FixedSizeList(field, _) => { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7737,6 +7737,12 @@ select flatten(arrow_cast(make_array(1, 2, 1, 3, 2), 'FixedSizeList(5, Int64)')) | |
| ---- | ||
| [1, 2, 1, 3, 2] [1, 2, 3, NULL, 4, NULL, 5] [[1.1], [2.2], [3.3], [4.4]] | ||
|
|
||
| query ?? | ||
| select flatten(arrow_cast(make_array([1], [2, 3], [null], make_array(4, null, 5)), 'FixedSizeList(4, LargeList(Int64))')), | ||
| flatten(arrow_cast(make_array([[1.1], [2.2]], [[3.3], [4.4]]), 'List(LargeList(FixedSizeList(1, Float64)))')); | ||
| ---- | ||
| [1, 2, 3, NULL, 4, NULL, 5] [[1.1], [2.2], [3.3], [4.4]] | ||
|
||
|
|
||
| # flatten with column values | ||
| query ???? | ||
| select flatten(column1), | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently this only supports arrays that can be converted from
LargeListtoList.