Skip to content
111 changes: 111 additions & 0 deletions datafusion/spark/src/function/math/ceil.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::any::Any;
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, AsArray};
use arrow::datatypes::DataType;
use arrow::datatypes::DataType::{Float32, Float64, Int64};
use datafusion_common::{exec_err, Result};
use datafusion_expr::Signature;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Volatility};
use datafusion_functions::utils::make_scalar_function;

/// <https://spark.apache.org/docs/latest/api/sql/index.html#ceil>
/// Difference between spark: There is no second optional argument to control the rounding behaviour.
Comment thread
Jefffrey marked this conversation as resolved.
Outdated
/// Takes an Int64/Float32/Float64 input and returns the smallest number after rounding up that is
/// not smaller than the input.
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct SparkCeil {
signature: Signature,
aliases: Vec<String>,
Comment thread
codetyri0n marked this conversation as resolved.
Outdated
}

impl Default for SparkCeil {
fn default() -> Self {
Self::new()
}
}

impl SparkCeil {
    /// Creates the function with a one-argument numeric signature,
    /// immutable volatility, and no aliases.
    pub fn new() -> Self {
        let signature = Signature::numeric(1, Volatility::Immutable);
        let aliases = Vec::new();
        Self { signature, aliases }
    }
}

impl ScalarUDFImpl for SparkCeil {
fn as_any(&self) -> &dyn Any {
self
}

fn name(&self) -> &str {
"ceil"
}

fn signature(&self) -> &Signature {
&self.signature
}

fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
Ok(Int64)
Comment thread
codetyri0n marked this conversation as resolved.
Outdated
}

fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
make_scalar_function(spark_ceil, vec![])(&args.args)
}

fn aliases(&self) -> &[String] {
&self.aliases
}
}
Comment on lines +132 to +147

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs refactoring: it has an unused parameter `_scale`, and the way it calculates the scale is odd — it doesn't need an if/else branch if it's always going to clamp to 0 anyway 🤔

Also, the number of comments here is overkill compared to what's actually being done 🙁

(It feels like LLM comments)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just wanted to be as descriptive as possible 😅, since there are larger changes to the existing code. (On a side note: an agent was, however, used here for writing the code.)


pub fn spark_ceil(args: &[ArrayRef]) -> Result<ArrayRef> {
Comment thread
codetyri0n marked this conversation as resolved.
Outdated
if args.len() != 1 {
return exec_err!("ceil expects exactly 1 argument, got {}", args.len());
}

let array: &dyn Array = args[0].as_ref();
match args[0].data_type() {
Float32 => {
let array = array
.as_primitive::<arrow::datatypes::Float32Type>()
.unary::<_, arrow::datatypes::Int64Type>(|value: f32| {
value.ceil() as i64
});
Ok(Arc::new(array))
}
Float64 => {
let array = array
.as_primitive::<arrow::datatypes::Float64Type>()
.unary::<_, arrow::datatypes::Int64Type>(|value: f64| {
value.ceil() as i64
});
Ok(Arc::new(array))
}
Int64 => Ok(Arc::clone(&args[0])),
_ => {
Comment thread
codetyri0n marked this conversation as resolved.
Outdated
exec_err!(
"ceil expects a numeric argument, got {}",
args[0].data_type()
)
}
}
}
8 changes: 8 additions & 0 deletions datafusion/spark/src/function/math/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

pub mod ceil;
pub mod expm1;
pub mod factorial;
pub mod hex;
Expand All @@ -33,6 +34,7 @@ make_udf_function!(modulus::SparkMod, modulus);
make_udf_function!(modulus::SparkPmod, pmod);
make_udf_function!(rint::SparkRint, rint);
make_udf_function!(width_bucket::SparkWidthBucket, width_bucket);
make_udf_function!(ceil::SparkCeil, ceil);

pub mod expr_fn {
use datafusion_functions::export_functions;
Expand All @@ -48,6 +50,11 @@ pub mod expr_fn {
export_functions!((pmod, "Returns the positive remainder of division of the first argument by the second argument.", arg1 arg2));
export_functions!((rint, "Returns the double value that is closest in value to the argument and is equal to a mathematical integer.", arg1));
export_functions!((width_bucket, "Returns the bucket number into which the value of this expression would fall after being evaluated.", arg1 arg2 arg3 arg4));
export_functions!((
    ceil,
    "Returns the smallest integer that is not smaller than the input value.",
    arg1
));
}

pub fn functions() -> Vec<Arc<ScalarUDF>> {
Expand All @@ -59,5 +66,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
pmod(),
rint(),
width_bucket(),
ceil(),
]
}
184 changes: 159 additions & 25 deletions datafusion/sqllogictest/test_files/spark/math/ceil.slt
Original file line number Diff line number Diff line change
Expand Up @@ -15,28 +15,162 @@
# specific language governing permissions and limitations
# under the License.

# This file was originally created by a porting script from:
# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
# This file is part of the implementation of the datafusion-spark function library.
# For more information, please see:
# https://github.com/apache/datafusion/issues/15914

## Original Query: SELECT ceil(-0.1);
## PySpark 3.5.5 Result: {'CEIL(-0.1)': Decimal('0'), 'typeof(CEIL(-0.1))': 'decimal(1,0)', 'typeof(-0.1)': 'decimal(1,1)'}
#query
#SELECT ceil(-0.1::decimal(1,1));

## Original Query: SELECT ceil(3.1411, -3);
## PySpark 3.5.5 Result: {'ceil(3.1411, -3)': Decimal('1000'), 'typeof(ceil(3.1411, -3))': 'decimal(4,0)', 'typeof(3.1411)': 'decimal(5,4)', 'typeof(-3)': 'int'}
#query
#SELECT ceil(3.1411::decimal(5,4), -3::int);

## Original Query: SELECT ceil(3.1411, 3);
## PySpark 3.5.5 Result: {'ceil(3.1411, 3)': Decimal('3.142'), 'typeof(ceil(3.1411, 3))': 'decimal(5,3)', 'typeof(3.1411)': 'decimal(5,4)', 'typeof(3)': 'int'}
#query
#SELECT ceil(3.1411::decimal(5,4), 3::int);

## Original Query: SELECT ceil(5);
## PySpark 3.5.5 Result: {'CEIL(5)': 5, 'typeof(CEIL(5))': 'bigint', 'typeof(5)': 'int'}
#query
#SELECT ceil(5::int);
# Test cases for the ceil function

# Test basic positive float values
query I
SELECT ceil(3.14::float);
----
4

query I
SELECT ceil(2.1::float);
----
3

query I
SELECT ceil(5.9::float);
----
6

# Test basic positive double values
query I
SELECT ceil(3.14159);
----
4

query I
SELECT ceil(2.1);
----
3

query I
SELECT ceil(5.999);
----
6

# Test negative float values
query I
SELECT ceil(-3.14::float);
----
-3

query I
SELECT ceil(-2.1::float);
----
-2

query I
SELECT ceil(-5.9::float);
----
-5

# Test negative double values
query I
SELECT ceil(-3.14159);
----
-3

query I
SELECT ceil(-2.1);
----
-2

query I
SELECT ceil(-5.999);
----
-5

# Test integer values
query I
SELECT ceil(52);
----
52

query I
SELECT ceil(-39);
----
-39

# Test zero values
query I
SELECT ceil(0.0);
----
0

query I
SELECT ceil(0.0::float);
----
0

query I
SELECT ceil(-3.0::float);
----
-3

# Test very small decimal values
query I
SELECT ceil(0.0001);
----
1

query I
SELECT ceil(-0.0001);
----
0

# Test large numbers
query I
SELECT ceil(123456.789);
----
123457

query I
SELECT ceil(-98765.432);
----
-98765

# Test with NULL values
query I
SELECT ceil(NULL);
----
NULL

# Test array inputs
query I
SELECT ceil(column1) FROM (VALUES (1.5), (2.7), (-1.3), (NULL)) AS t(column1);
----
2
3
-1
NULL

query I
SELECT ceil(column1::float) FROM (VALUES (1.5), (2.7), (-1.3), (NULL)) AS t(column1);
----
2
3
-1
NULL

# Test edge cases
query I
SELECT ceil(0.999999999999);
----
1

query I
SELECT ceil(-0.999999999999);
----
0

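# Test boundary values.
# Note: 9223372036854775807.0 is not exactly representable as a double (the
# nearest value is 2^63), so the first expected result relies on the
# float-to-Int64 conversion saturating at the maximum Int64 value.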
query I
SELECT ceil(9223372036854775807.0);
----
9223372036854775807

query I
SELECT ceil(-9223372036854775808.0);
----
-9223372036854775808