Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ object FunctionRegistry {
expression[Tan]("tan"),
expression[Cot]("cot"),
expression[Tanh]("tanh"),
expression[WidthBucket]("width_bucket"),

expression[Add]("+"),
expression[Subtract]("-"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1325,3 +1325,99 @@ case class BRound(child: Expression, scale: Expression)
with Serializable with ImplicitCastInputTypes {
def this(child: Expression) = this(child, Literal(0))
}

object WidthBucket {

def computeBucketNumber(value: Double, min: Double, max: Double, numBucket: Long): jl.Long = {
if (numBucket <= 0 || numBucket == Long.MaxValue || jl.Double.isNaN(value) || min == max ||
jl.Double.isNaN(min) || jl.Double.isInfinite(min) ||
jl.Double.isNaN(max) || jl.Double.isInfinite(max)) {
return null
}

val lower = Math.min(min, max)
val upper = Math.max(min, max)

if (min < max) {
if (value < lower) {
0L
} else if (value >= upper) {
numBucket + 1L
} else {
(numBucket.toDouble * (value - lower) / (upper - lower)).toLong + 1L
}
} else { // `min > max` case
if (value > upper) {
0L
} else if (value <= lower) {
numBucket + 1L
} else {
(numBucket.toDouble * (upper - value) / (upper - lower)).toLong + 1L
}
}
}
}

/**
* Returns the bucket number into which the value of this expression would fall
* after being evaluated. Note that input arguments must follow conditions listed below;
* otherwise, the method will return null.
* - `numBucket` must be greater than zero and be less than Long.MaxValue
* - `value`, `min`, and `max` cannot be NaN
* - `min` bound cannot equal `max`
* - `min` and `max` must be finite
*
* Note: If `minValue` > `maxValue`, a return value is as follows;
* if `value` > `minValue`, it returns 0.
* if `value` <= `maxValue`, it returns `numBucket` + 1.
* otherwise, it returns (`numBucket` * (`minValue` - `value`) / (`minValue` - `maxValue`)) + 1
*
* @param value is the expression to compute a bucket number in the histogram
* @param minValue is the minimum value of the histogram
* @param maxValue is the maximum value of the histogram
* @param numBucket is the number of buckets
*/
@ExpressionDescription(
usage = """
_FUNC_(value, min_value, max_value, num_bucket) - Returns the bucket number to which
`value` would be assigned in an equiwidth histogram with `num_bucket` buckets,
in the range `min_value` to `max_value`."
""",
examples = """
Examples:
> SELECT _FUNC_(5.3, 0.2, 10.6, 5);
3
> SELECT _FUNC_(-2.1, 1.3, 3.4, 3);
0
> SELECT _FUNC_(8.1, 0.0, 5.7, 4);
5
> SELECT _FUNC_(-0.9, 5.2, 0.5, 2);
3
""",
since = "3.1.0")
case class WidthBucket(
value: Expression,
minValue: Expression,
maxValue: Expression,
numBucket: Expression)
extends QuaternaryExpression with ImplicitCastInputTypes with NullIntolerant {

override def children: Seq[Expression] = Seq(value, minValue, maxValue, numBucket)
override def inputTypes: Seq[AbstractDataType] = Seq(DoubleType, DoubleType, DoubleType, LongType)
override def dataType: DataType = LongType
override def nullable: Boolean = true
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

override def prettyName: String = "width_bucket" ?


override protected def nullSafeEval(input: Any, min: Any, max: Any, numBucket: Any): Any = {
WidthBucket.computeBucketNumber(
input.asInstanceOf[Double],
min.asInstanceOf[Double],
max.asInstanceOf[Double],
numBucket.asInstanceOf[Long])
}

override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
defineCodeGen(ctx, ev, (input, min, max, numBucket) =>
"org.apache.spark.sql.catalyst.expressions.WidthBucket" +
s".computeBucketNumber($input, $min, $max, $numBucket)")
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<!-- Automatically generated by ExpressionsSchemaSuite -->
## Summary
- Number of queries: 337
- Number of queries: 338
- Number of expressions that missing example: 34
- Expressions missing examples: and,string,tinyint,double,smallint,date,decimal,boolean,float,binary,bigint,int,timestamp,struct,cume_dist,dense_rank,input_file_block_length,input_file_block_start,input_file_name,lag,lead,monotonically_increasing_id,ntile,!,not,or,percent_rank,rank,row_number,spark_partition_id,version,window,positive,count_min_sketch
## Schema of Built-in Functions
Expand Down Expand Up @@ -291,6 +291,7 @@
| org.apache.spark.sql.catalyst.expressions.Uuid | uuid | SELECT uuid() | struct<uuid():string> |
| org.apache.spark.sql.catalyst.expressions.WeekDay | weekday | SELECT weekday('2009-07-30') | struct<weekday(CAST(2009-07-30 AS DATE)):int> |
| org.apache.spark.sql.catalyst.expressions.WeekOfYear | weekofyear | SELECT weekofyear('2008-02-20') | struct<weekofyear(CAST(2008-02-20 AS DATE)):int> |
| org.apache.spark.sql.catalyst.expressions.WidthBucket | width_bucket | SELECT width_bucket(5.3, 0.2, 10.6, 5) | struct<widthbucket(CAST(5.3 AS DOUBLE), CAST(0.2 AS DOUBLE), CAST(10.6 AS DOUBLE), CAST(5 AS BIGINT)):bigint> |
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The output schema is widthbucket? It should be width_bucket if we define the prettyName to width_bucket?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yea, it looks better. I'll fix it later.

| org.apache.spark.sql.catalyst.expressions.XxHash64 | xxhash64 | SELECT xxhash64('Spark', array(123), 2) | struct<xxhash64(Spark, array(123), 2):bigint> |
| org.apache.spark.sql.catalyst.expressions.Year | year | SELECT year('2016-07-30') | struct<year(CAST(2016-07-30 AS DATE)):int> |
| org.apache.spark.sql.catalyst.expressions.ZipWith | zip_with | SELECT zip_with(array(1, 2, 3), array('a', 'b', 'c'), (x, y) -> (y, x)) | struct<zip_with(array(1, 2, 3), array(a, b, c), lambdafunction(named_struct(y, namedlambdavariable(), x, namedlambdavariable()), namedlambdavariable(), namedlambdavariable())):array<struct<y:string,x:int>>> |
Expand Down
14 changes: 14 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/operators.sql
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,17 @@ select positive('-1.11'), positive(-1.11), negative('-1.11'), negative(-1.11);
-- pmod
select pmod(-7, 2), pmod(0, 2), pmod(7, 0), pmod(7, null), pmod(null, 2), pmod(null, null);
select pmod(cast(3.13 as decimal), cast(0 as decimal)), pmod(cast(2 as smallint), cast(0 as smallint));

-- width_bucket
select width_bucket(5.35, 0.024, 10.06, 5);
select width_bucket(5.35, 0.024, 10.06, 3 + 2);
select width_bucket('5.35', '0.024', '10.06', '5');
select width_bucket(5.35, 0.024, 10.06, 2.5);
select width_bucket(5.35, 0.024, 10.06, 0.5);
select width_bucket(null, 0.024, 10.06, 5);
select width_bucket(5.35, null, 10.06, 5);
select width_bucket(5.35, 0.024, null, -5);
select width_bucket(5.35, 0.024, 10.06, null);
select width_bucket(5.35, 0.024, 10.06, -5);
select width_bucket(5.35, 0.024, 10.06, 9223372036854775807L); -- long max value
select width_bucket(5.35, 0.024, 10.06, 9223372036854775807L - 1);
76 changes: 49 additions & 27 deletions sql/core/src/test/resources/sql-tests/inputs/postgreSQL/numeric.sql
Original file line number Diff line number Diff line change
Expand Up @@ -754,22 +754,22 @@ DROP TABLE ceil_floor_round;
-- round((2.5 * 10 ^ i)::numeric, -i)
-- FROM generate_series(-5,5) AS t(i);

-- [SPARK-21117] Built-in SQL Function Support - WIDTH_BUCKET
-- Testing for width_bucket(). For convenience, we test both the
-- numeric and float8 versions of the function in this file.

-- errors
-- SELECT width_bucket(5.0, 3.0, 4.0, 0);
-- SELECT width_bucket(5.0, 3.0, 4.0, -5);
-- SELECT width_bucket(3.5, 3.0, 3.0, 888);
-- SELECT width_bucket(5.0::float8, 3.0::float8, 4.0::float8, 0);
-- SELECT width_bucket(5.0::float8, 3.0::float8, 4.0::float8, -5);
-- SELECT width_bucket(3.5::float8, 3.0::float8, 3.0::float8, 888);
-- SELECT width_bucket('NaN', 3.0, 4.0, 888);
-- SELECT width_bucket(0::float8, 'NaN', 4.0::float8, 888);
SELECT width_bucket(5.0, 3.0, 4.0, 0);
SELECT width_bucket(5.0, 3.0, 4.0, -5);
SELECT width_bucket(3.5, 3.0, 3.0, 888);
SELECT width_bucket(double(5.0), double(3.0), double(4.0), 0);
SELECT width_bucket(double(5.0), double(3.0), double(4.0), -5);
SELECT width_bucket(double(3.5), double(3.0), double(3.0), 888);
SELECT width_bucket('NaN', 3.0, 4.0, 888);
SELECT width_bucket(double(0), 'NaN', double(4.0), 888);

-- normal operation
-- CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8);
CREATE TABLE width_bucket_test (operand_num decimal(30,15), operand_f8 double) USING parquet;

-- COPY width_bucket_test (operand_num) FROM stdin;
-- -5.2
Expand All @@ -795,28 +795,50 @@ DROP TABLE ceil_floor_round;

-- UPDATE width_bucket_test SET operand_f8 = operand_num::float8;

-- SELECT
-- operand_num,
-- width_bucket(operand_num, 0, 10, 5) AS wb_1,
-- width_bucket(operand_f8, 0, 10, 5) AS wb_1f,
-- width_bucket(operand_num, 10, 0, 5) AS wb_2,
-- width_bucket(operand_f8, 10, 0, 5) AS wb_2f,
-- width_bucket(operand_num, 2, 8, 4) AS wb_3,
-- width_bucket(operand_f8, 2, 8, 4) AS wb_3f,
-- width_bucket(operand_num, 5.0, 5.5, 20) AS wb_4,
-- width_bucket(operand_f8, 5.0, 5.5, 20) AS wb_4f,
-- width_bucket(operand_num, -25, 25, 10) AS wb_5,
-- width_bucket(operand_f8, -25, 25, 10) AS wb_5f
-- FROM width_bucket_test;
INSERT INTO width_bucket_test VALUES
(-5.2, -5.2),
(-0.0000000001, -0.0000000001),
(0.000000000001, 0.000000000001),
(1, 1),
(1.99999999999999, 1.99999999999999),
(2, 2),
(2.00000000000001, 2.00000000000001),
(3, 3),
(4, 4),
(4.5, 4.5),
(5, 5),
(5.5, 5.5),
(6, 6),
(7, 7),
(8, 8),
(9, 9),
(9.99999999999999, 9.99999999999999),
(10, 10),
(10.0000000000001, 10.0000000000001);

SELECT
operand_num,
width_bucket(operand_num, 0, 10, 5) AS wb_1,
width_bucket(operand_f8, 0, 10, 5) AS wb_1f,
width_bucket(operand_num, 10, 0, 5) AS wb_2,
width_bucket(operand_f8, 10, 0, 5) AS wb_2f,
width_bucket(operand_num, 2, 8, 4) AS wb_3,
width_bucket(operand_f8, 2, 8, 4) AS wb_3f,
width_bucket(operand_num, 5.0, 5.5, 20) AS wb_4,
width_bucket(operand_f8, 5.0, 5.5, 20) AS wb_4f,
width_bucket(operand_num, -25, 25, 10) AS wb_5,
width_bucket(operand_f8, -25, 25, 10) AS wb_5f
FROM width_bucket_test
ORDER BY operand_num ASC;

-- for float8 only, check positive and negative infinity: we require
-- finite bucket bounds, but allow an infinite operand
-- SELECT width_bucket(0.0::float8, 'Infinity'::float8, 5, 10); -- error
-- SELECT width_bucket(0.0::float8, 5, '-Infinity'::float8, 20); -- error
-- SELECT width_bucket('Infinity'::float8, 1, 10, 10),
-- width_bucket('-Infinity'::float8, 1, 10, 10);
SELECT width_bucket(double(0.0), double('Infinity'), 5, 10); -- error
SELECT width_bucket(double(0.0), 5, double('-Infinity'), 20); -- error
SELECT width_bucket(double('Infinity'), 1, 10, 10),
width_bucket(double('-Infinity'), 1, 10, 10);

-- DROP TABLE width_bucket_test;
DROP TABLE width_bucket_test;

-- [SPARK-28137] Missing Data Type Formatting Functions: TO_CHAR
-- TO_CHAR()
Expand Down
98 changes: 97 additions & 1 deletion sql/core/src/test/resources/sql-tests/results/operators.sql.out
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 57
-- Number of queries: 69


-- !query
Expand Down Expand Up @@ -456,3 +456,99 @@ select pmod(cast(3.13 as decimal), cast(0 as decimal)), pmod(cast(2 as smallint)
struct<pmod(CAST(3.13 AS DECIMAL(10,0)), CAST(0 AS DECIMAL(10,0))):decimal(10,0),pmod(CAST(2 AS SMALLINT), CAST(0 AS SMALLINT)):smallint>
-- !query output
NULL NULL


-- !query
select width_bucket(5.35, 0.024, 10.06, 5)
-- !query schema
struct<widthbucket(CAST(5.35 AS DOUBLE), CAST(0.024 AS DOUBLE), CAST(10.06 AS DOUBLE), CAST(5 AS BIGINT)):bigint>
-- !query output
3


-- !query
select width_bucket(5.35, 0.024, 10.06, 3 + 2)
-- !query schema
struct<widthbucket(CAST(5.35 AS DOUBLE), CAST(0.024 AS DOUBLE), CAST(10.06 AS DOUBLE), CAST((3 + 2) AS BIGINT)):bigint>
-- !query output
3


-- !query
select width_bucket('5.35', '0.024', '10.06', '5')
-- !query schema
struct<widthbucket(CAST(5.35 AS DOUBLE), CAST(0.024 AS DOUBLE), CAST(10.06 AS DOUBLE), CAST(5 AS BIGINT)):bigint>
-- !query output
3


-- !query
select width_bucket(5.35, 0.024, 10.06, 2.5)
-- !query schema
struct<widthbucket(CAST(5.35 AS DOUBLE), CAST(0.024 AS DOUBLE), CAST(10.06 AS DOUBLE), CAST(2.5 AS BIGINT)):bigint>
-- !query output
2


-- !query
select width_bucket(5.35, 0.024, 10.06, 0.5)
-- !query schema
struct<widthbucket(CAST(5.35 AS DOUBLE), CAST(0.024 AS DOUBLE), CAST(10.06 AS DOUBLE), CAST(0.5 AS BIGINT)):bigint>
-- !query output
NULL


-- !query
select width_bucket(null, 0.024, 10.06, 5)
-- !query schema
struct<widthbucket(CAST(NULL AS DOUBLE), CAST(0.024 AS DOUBLE), CAST(10.06 AS DOUBLE), CAST(5 AS BIGINT)):bigint>
-- !query output
NULL


-- !query
select width_bucket(5.35, null, 10.06, 5)
-- !query schema
struct<widthbucket(CAST(5.35 AS DOUBLE), CAST(NULL AS DOUBLE), CAST(10.06 AS DOUBLE), CAST(5 AS BIGINT)):bigint>
-- !query output
NULL


-- !query
select width_bucket(5.35, 0.024, null, -5)
-- !query schema
struct<widthbucket(CAST(5.35 AS DOUBLE), CAST(0.024 AS DOUBLE), CAST(NULL AS DOUBLE), CAST(-5 AS BIGINT)):bigint>
-- !query output
NULL


-- !query
select width_bucket(5.35, 0.024, 10.06, null)
-- !query schema
struct<widthbucket(CAST(5.35 AS DOUBLE), CAST(0.024 AS DOUBLE), CAST(10.06 AS DOUBLE), CAST(NULL AS BIGINT)):bigint>
-- !query output
NULL


-- !query
select width_bucket(5.35, 0.024, 10.06, -5)
-- !query schema
struct<widthbucket(CAST(5.35 AS DOUBLE), CAST(0.024 AS DOUBLE), CAST(10.06 AS DOUBLE), CAST(-5 AS BIGINT)):bigint>
-- !query output
NULL


-- !query
select width_bucket(5.35, 0.024, 10.06, 9223372036854775807L)
-- !query schema
struct<widthbucket(CAST(5.35 AS DOUBLE), CAST(0.024 AS DOUBLE), CAST(10.06 AS DOUBLE), 9223372036854775807):bigint>
-- !query output
NULL


-- !query
select width_bucket(5.35, 0.024, 10.06, 9223372036854775807L - 1)
-- !query schema
struct<widthbucket(CAST(5.35 AS DOUBLE), CAST(0.024 AS DOUBLE), CAST(10.06 AS DOUBLE), (9223372036854775807 - CAST(1 AS BIGINT))):bigint>
-- !query output
4894746858139549697
Loading