Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 3 additions & 19 deletions datafusion/spark/src/function/array/spark_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ use arrow::array::{
MutableArrayData, NullArray, OffsetSizeTrait,
};
use arrow::buffer::OffsetBuffer;
use arrow::datatypes::{DataType, Field, FieldRef};
use arrow::datatypes::{DataType, Field};
use datafusion_common::utils::SingleRowListArrayBuilder;
use datafusion_common::{plan_datafusion_err, plan_err, Result};
use datafusion_expr::type_coercion::binary::comparison_coercion;
use datafusion_expr::{
ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature,
TypeSignature, Volatility,
ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
Volatility,
};

use crate::function::functions_nested_utils::make_scalar_function;
Expand Down Expand Up @@ -92,21 +92,6 @@ impl ScalarUDFImpl for SparkArray {
))))
}

fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
let data_types = args
.arg_fields
.iter()
.map(|f| f.data_type())
.cloned()
.collect::<Vec<_>>();
let return_type = self.return_type(&data_types)?;
Ok(Arc::new(Field::new(
"this_field_name_is_irrelevant",
return_type,
false,
)))
}
Comment on lines -95 to -108

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm actually thinking maybe we keep this and just fold return_type logic into here; specifically because it returns false for the nullability whereas default implementation for return_field_from_args returns true for nullability.

  • We'd still need to implement return_type to satisfy trait, but we can have it return an internal err, see:

fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
internal_err!("return_field_from_args should be used instead")
}
fn return_field_from_args(
&self,
args: datafusion_expr::ReturnFieldArgs,
) -> Result<FieldRef> {
let [map_type] = take_function_args(self.name(), args.arg_fields)?;
Ok(Field::new(
self.name(),
DataType::List(get_map_values_field_as_list_field(map_type.data_type())?),
// Nullable if the map is nullable
args.arg_fields.iter().any(|x| x.is_nullable()),
)
.into())
}

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah thanks for pointing it out, will raise a revision.

For my learning, make_array implements return_type but not return_field_from_args,

fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
which means it uses default implementation of return_field_from_args that has nullable set to true. How would nullability on the return field affect behavior of sql queries? Thanks.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not familiar with the degree to which DataFusion uses the nullability information of a field, so I can't say what the impact would be. But I assume it's better to be more informative where possible, e.g. we know make array function can't ever return a null so it's good if we can preserve that information, especially since the Spark version already had it.


fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
let ScalarFunctionArgs { args, .. } = args;
make_scalar_function(make_array_inner)(args.as_slice())
Expand Down Expand Up @@ -166,7 +151,6 @@ pub fn make_array_inner(arrays: &[ArrayRef]) -> Result<ArrayRef> {
.build_list_array(),
))
}
DataType::LargeList(..) => array_array::<i64>(arrays, data_type),
_ => array_array::<i32>(arrays, data_type),
}
}
Expand Down
15 changes: 15 additions & 0 deletions datafusion/sqllogictest/test_files/spark/array/array.slt
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,18 @@ query ?
SELECT array(array(1,2));
----
[[1, 2]]

query ?
SELECT array(arrow_cast(array(1), 'LargeList(Int64)'));
----
[[1]]

query ?
SELECT array(arrow_cast(array(1), 'LargeList(Int64)'), arrow_cast(array(), 'LargeList(Int64)'));
----
[[1], []]

query ?
SELECT array(arrow_cast(array(1,2), 'LargeList(Int64)'), array(3));
----
[[1, 2], [3]]