diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index e2559d4c07297..3989df5d29467 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -274,6 +274,7 @@ object FunctionRegistry { expression[Tan]("tan"), expression[Cot]("cot"), expression[Tanh]("tanh"), + expression[WidthBucket]("width_bucket"), expression[Add]("+"), expression[Subtract]("-"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index fe8ea2a3c6733..5c764956adf0a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -1325,3 +1325,99 @@ case class BRound(child: Expression, scale: Expression) with Serializable with ImplicitCastInputTypes { def this(child: Expression) = this(child, Literal(0)) } + +object WidthBucket { + + def computeBucketNumber(value: Double, min: Double, max: Double, numBucket: Long): jl.Long = { + if (numBucket <= 0 || numBucket == Long.MaxValue || jl.Double.isNaN(value) || min == max || + jl.Double.isNaN(min) || jl.Double.isInfinite(min) || + jl.Double.isNaN(max) || jl.Double.isInfinite(max)) { + return null + } + + val lower = Math.min(min, max) + val upper = Math.max(min, max) + + if (min < max) { + if (value < lower) { + 0L + } else if (value >= upper) { + numBucket + 1L + } else { + (numBucket.toDouble * (value - lower) / (upper - lower)).toLong + 1L + } + } else { // `min > max` case + if (value > upper) { + 0L + } else if (value <= lower) { + numBucket + 1L + } else { + (numBucket.toDouble * (upper - value) / (upper - lower)).toLong + 1L + } + } + } +} + +/** + * Returns the bucket number into which the value of this expression would fall + * after being evaluated. Note that input arguments must follow conditions listed below; + * otherwise, the method will return null. + * - `numBucket` must be greater than zero and be less than Long.MaxValue + * - `value`, `min`, and `max` cannot be NaN + * - `min` bound cannot equal `max` + * - `min` and `max` must be finite + * + * Note: If `minValue` > `maxValue`, a return value is as follows; + * if `value` > `minValue`, it returns 0. + * if `value` <= `maxValue`, it returns `numBucket` + 1. + * otherwise, it returns (`numBucket` * (`minValue` - `value`) / (`minValue` - `maxValue`)) + 1 + * + * @param value is the expression to compute a bucket number in the histogram + * @param minValue is the minimum value of the histogram + * @param maxValue is the maximum value of the histogram + * @param numBucket is the number of buckets + */ +@ExpressionDescription( + usage = """ + _FUNC_(value, min_value, max_value, num_bucket) - Returns the bucket number to which + `value` would be assigned in an equiwidth histogram with `num_bucket` buckets, + in the range `min_value` to `max_value`." + """, + examples = """ + Examples: + > SELECT _FUNC_(5.3, 0.2, 10.6, 5); + 3 + > SELECT _FUNC_(-2.1, 1.3, 3.4, 3); + 0 + > SELECT _FUNC_(8.1, 0.0, 5.7, 4); + 5 + > SELECT _FUNC_(-0.9, 5.2, 0.5, 2); + 3 + """, + since = "3.1.0") +case class WidthBucket( + value: Expression, + minValue: Expression, + maxValue: Expression, + numBucket: Expression) + extends QuaternaryExpression with ImplicitCastInputTypes with NullIntolerant { + + override def children: Seq[Expression] = Seq(value, minValue, maxValue, numBucket) + override def inputTypes: Seq[AbstractDataType] = Seq(DoubleType, DoubleType, DoubleType, LongType) + override def dataType: DataType = LongType + override def nullable: Boolean = true + + override protected def nullSafeEval(input: Any, min: Any, max: Any, numBucket: Any): Any = { + WidthBucket.computeBucketNumber( + input.asInstanceOf[Double], + min.asInstanceOf[Double], + max.asInstanceOf[Double], + numBucket.asInstanceOf[Long]) + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + defineCodeGen(ctx, ev, (input, min, max, numBucket) => + "org.apache.spark.sql.catalyst.expressions.WidthBucket" + + s".computeBucketNumber($input, $min, $max, $numBucket)") + } +} diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index d245aa5a17345..1a9d5bb775a0a 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -1,6 +1,6 @@ ## Summary - - Number of queries: 337 + - Number of queries: 338 - Number of expressions that missing example: 34 - Expressions missing examples: and,string,tinyint,double,smallint,date,decimal,boolean,float,binary,bigint,int,timestamp,struct,cume_dist,dense_rank,input_file_block_length,input_file_block_start,input_file_name,lag,lead,monotonically_increasing_id,ntile,!,not,or,percent_rank,rank,row_number,spark_partition_id,version,window,positive,count_min_sketch ## Schema of Built-in Functions @@ -291,6 +291,7 @@ | org.apache.spark.sql.catalyst.expressions.Uuid | uuid | SELECT uuid() | struct | | org.apache.spark.sql.catalyst.expressions.WeekDay | weekday | SELECT weekday('2009-07-30') | struct | | org.apache.spark.sql.catalyst.expressions.WeekOfYear | weekofyear | SELECT weekofyear('2008-02-20') | struct | +| org.apache.spark.sql.catalyst.expressions.WidthBucket | width_bucket | SELECT width_bucket(5.3, 0.2, 10.6, 5) | struct | | org.apache.spark.sql.catalyst.expressions.XxHash64 | xxhash64 | SELECT xxhash64('Spark', array(123), 2) | struct | | org.apache.spark.sql.catalyst.expressions.Year | year | SELECT year('2016-07-30') | struct | | org.apache.spark.sql.catalyst.expressions.ZipWith | zip_with | SELECT zip_with(array(1, 2, 3), array('a', 'b', 'c'), (x, y) -> (y, x)) | struct>> | diff --git a/sql/core/src/test/resources/sql-tests/inputs/operators.sql b/sql/core/src/test/resources/sql-tests/inputs/operators.sql index 20bf0eb15c5b2..c296fa5f7e87e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/operators.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/operators.sql @@ -81,3 +81,17 @@ select positive('-1.11'), positive(-1.11), negative('-1.11'), negative(-1.11); -- pmod select pmod(-7, 2), pmod(0, 2), pmod(7, 0), pmod(7, null), pmod(null, 2), pmod(null, null); select pmod(cast(3.13 as decimal), cast(0 as decimal)), pmod(cast(2 as smallint), cast(0 as smallint)); + +-- width_bucket +select width_bucket(5.35, 0.024, 10.06, 5); +select width_bucket(5.35, 0.024, 10.06, 3 + 2); +select width_bucket('5.35', '0.024', '10.06', '5'); +select width_bucket(5.35, 0.024, 10.06, 2.5); +select width_bucket(5.35, 0.024, 10.06, 0.5); +select width_bucket(null, 0.024, 10.06, 5); +select width_bucket(5.35, null, 10.06, 5); +select width_bucket(5.35, 0.024, null, -5); +select width_bucket(5.35, 0.024, 10.06, null); +select width_bucket(5.35, 0.024, 10.06, -5); +select width_bucket(5.35, 0.024, 10.06, 9223372036854775807L); -- long max value +select width_bucket(5.35, 0.024, 10.06, 9223372036854775807L - 1); diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/numeric.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/numeric.sql index dbdb2cace0e0c..53f2aa41ae3fa 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/numeric.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/numeric.sql @@ -754,22 +754,22 @@ DROP TABLE ceil_floor_round; -- round((2.5 * 10 ^ i)::numeric, -i) -- FROM generate_series(-5,5) AS t(i); --- [SPARK-21117] Built-in SQL Function Support - WIDTH_BUCKET -- Testing for width_bucket(). For convenience, we test both the -- numeric and float8 versions of the function in this file. -- errors --- SELECT width_bucket(5.0, 3.0, 4.0, 0); --- SELECT width_bucket(5.0, 3.0, 4.0, -5); --- SELECT width_bucket(3.5, 3.0, 3.0, 888); --- SELECT width_bucket(5.0::float8, 3.0::float8, 4.0::float8, 0); --- SELECT width_bucket(5.0::float8, 3.0::float8, 4.0::float8, -5); --- SELECT width_bucket(3.5::float8, 3.0::float8, 3.0::float8, 888); --- SELECT width_bucket('NaN', 3.0, 4.0, 888); --- SELECT width_bucket(0::float8, 'NaN', 4.0::float8, 888); +SELECT width_bucket(5.0, 3.0, 4.0, 0); +SELECT width_bucket(5.0, 3.0, 4.0, -5); +SELECT width_bucket(3.5, 3.0, 3.0, 888); +SELECT width_bucket(double(5.0), double(3.0), double(4.0), 0); +SELECT width_bucket(double(5.0), double(3.0), double(4.0), -5); +SELECT width_bucket(double(3.5), double(3.0), double(3.0), 888); +SELECT width_bucket('NaN', 3.0, 4.0, 888); +SELECT width_bucket(double(0), 'NaN', double(4.0), 888); -- normal operation -- CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8); +CREATE TABLE width_bucket_test (operand_num decimal(30,15), operand_f8 double) USING parquet; -- COPY width_bucket_test (operand_num) FROM stdin; -- -5.2 @@ -795,28 +795,50 @@ DROP TABLE ceil_floor_round; -- UPDATE width_bucket_test SET operand_f8 = operand_num::float8; --- SELECT --- operand_num, --- width_bucket(operand_num, 0, 10, 5) AS wb_1, --- width_bucket(operand_f8, 0, 10, 5) AS wb_1f, --- width_bucket(operand_num, 10, 0, 5) AS wb_2, --- width_bucket(operand_f8, 10, 0, 5) AS wb_2f, --- width_bucket(operand_num, 2, 8, 4) AS wb_3, --- width_bucket(operand_f8, 2, 8, 4) AS wb_3f, --- width_bucket(operand_num, 5.0, 5.5, 20) AS wb_4, --- width_bucket(operand_f8, 5.0, 5.5, 20) AS wb_4f, --- width_bucket(operand_num, -25, 25, 10) AS wb_5, --- width_bucket(operand_f8, -25, 25, 10) AS wb_5f --- FROM width_bucket_test; +INSERT INTO width_bucket_test VALUES + (-5.2, -5.2), + (-0.0000000001, -0.0000000001), + (0.000000000001, 0.000000000001), + (1, 1), + (1.99999999999999, 1.99999999999999), + (2, 2), + (2.00000000000001, 2.00000000000001), + (3, 3), + (4, 4), + (4.5, 4.5), + (5, 5), + (5.5, 5.5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (9.99999999999999, 9.99999999999999), + (10, 10), + (10.0000000000001, 10.0000000000001); + +SELECT + operand_num, + width_bucket(operand_num, 0, 10, 5) AS wb_1, + width_bucket(operand_f8, 0, 10, 5) AS wb_1f, + width_bucket(operand_num, 10, 0, 5) AS wb_2, + width_bucket(operand_f8, 10, 0, 5) AS wb_2f, + width_bucket(operand_num, 2, 8, 4) AS wb_3, + width_bucket(operand_f8, 2, 8, 4) AS wb_3f, + width_bucket(operand_num, 5.0, 5.5, 20) AS wb_4, + width_bucket(operand_f8, 5.0, 5.5, 20) AS wb_4f, + width_bucket(operand_num, -25, 25, 10) AS wb_5, + width_bucket(operand_f8, -25, 25, 10) AS wb_5f + FROM width_bucket_test + ORDER BY operand_num ASC; -- for float8 only, check positive and negative infinity: we require -- finite bucket bounds, but allow an infinite operand --- SELECT width_bucket(0.0::float8, 'Infinity'::float8, 5, 10); -- error --- SELECT width_bucket(0.0::float8, 5, '-Infinity'::float8, 20); -- error --- SELECT width_bucket('Infinity'::float8, 1, 10, 10), --- width_bucket('-Infinity'::float8, 1, 10, 10); +SELECT width_bucket(double(0.0), double('Infinity'), 5, 10); -- error +SELECT width_bucket(double(0.0), 5, double('-Infinity'), 20); -- error +SELECT width_bucket(double('Infinity'), 1, 10, 10), + width_bucket(double('-Infinity'), 1, 10, 10); --- DROP TABLE width_bucket_test; +DROP TABLE width_bucket_test; -- [SPARK-28137] Missing Data Type Formatting Functions: TO_CHAR -- TO_CHAR() diff --git a/sql/core/src/test/resources/sql-tests/results/operators.sql.out b/sql/core/src/test/resources/sql-tests/results/operators.sql.out index 9accc57d0bf60..fc58bc784c82f 100644 --- a/sql/core/src/test/resources/sql-tests/results/operators.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/operators.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 57 +-- Number of queries: 69 -- !query @@ -456,3 +456,99 @@ select pmod(cast(3.13 as decimal), cast(0 as decimal)), pmod(cast(2 as smallint) struct -- !query output NULL NULL + + +-- !query +select width_bucket(5.35, 0.024, 10.06, 5) +-- !query schema +struct +-- !query output +3 + + +-- !query +select width_bucket(5.35, 0.024, 10.06, 3 + 2) +-- !query schema +struct +-- !query output +3 + + +-- !query +select width_bucket('5.35', '0.024', '10.06', '5') +-- !query schema +struct +-- !query output +3 + + +-- !query +select width_bucket(5.35, 0.024, 10.06, 2.5) +-- !query schema +struct +-- !query output +2 + + +-- !query +select width_bucket(5.35, 0.024, 10.06, 0.5) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select width_bucket(null, 0.024, 10.06, 5) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select width_bucket(5.35, null, 10.06, 5) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select width_bucket(5.35, 0.024, null, -5) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select width_bucket(5.35, 0.024, 10.06, null) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select width_bucket(5.35, 0.024, 10.06, -5) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select width_bucket(5.35, 0.024, 10.06, 9223372036854775807L) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select width_bucket(5.35, 0.024, 10.06, 9223372036854775807L - 1) +-- !query schema +struct +-- !query output +4894746858139549697 diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/numeric.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/numeric.sql.out index 7b7aeb4ec7934..3cade4f1f9558 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/numeric.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/numeric.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 577 +-- Number of queries: 592 -- !query @@ -4423,6 +4423,177 @@ struct<> +-- !query +SELECT width_bucket(5.0, 3.0, 4.0, 0) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT width_bucket(5.0, 3.0, 4.0, -5) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT width_bucket(3.5, 3.0, 3.0, 888) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT width_bucket(double(5.0), double(3.0), double(4.0), 0) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT width_bucket(double(5.0), double(3.0), double(4.0), -5) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT width_bucket(double(3.5), double(3.0), double(3.0), 888) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT width_bucket('NaN', 3.0, 4.0, 888) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT width_bucket(double(0), 'NaN', double(4.0), 888) +-- !query schema +struct +-- !query output +NULL + + +-- !query +CREATE TABLE width_bucket_test (operand_num decimal(30,15), operand_f8 double) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO width_bucket_test VALUES + (-5.2, -5.2), + (-0.0000000001, -0.0000000001), + (0.000000000001, 0.000000000001), + (1, 1), + (1.99999999999999, 1.99999999999999), + (2, 2), + (2.00000000000001, 2.00000000000001), + (3, 3), + (4, 4), + (4.5, 4.5), + (5, 5), + (5.5, 5.5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (9.99999999999999, 9.99999999999999), + (10, 10), + (10.0000000000001, 10.0000000000001) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT + operand_num, + width_bucket(operand_num, 0, 10, 5) AS wb_1, + width_bucket(operand_f8, 0, 10, 5) AS wb_1f, + width_bucket(operand_num, 10, 0, 5) AS wb_2, + width_bucket(operand_f8, 10, 0, 5) AS wb_2f, + width_bucket(operand_num, 2, 8, 4) AS wb_3, + width_bucket(operand_f8, 2, 8, 4) AS wb_3f, + width_bucket(operand_num, 5.0, 5.5, 20) AS wb_4, + width_bucket(operand_f8, 5.0, 5.5, 20) AS wb_4f, + width_bucket(operand_num, -25, 25, 10) AS wb_5, + width_bucket(operand_f8, -25, 25, 10) AS wb_5f + FROM width_bucket_test + ORDER BY operand_num ASC +-- !query schema +struct +-- !query output +-5.200000000000000 0 0 6 6 0 0 0 0 4 4 +-0.000000000100000 0 0 6 6 0 0 0 0 5 5 +0.000000000001000 1 1 5 5 0 0 0 0 6 6 +1.000000000000000 1 1 5 5 0 0 0 0 6 6 +1.999999999999990 1 1 5 5 0 0 0 0 6 6 +2.000000000000000 2 2 5 5 1 1 0 0 6 6 +2.000000000000010 2 2 4 4 1 1 0 0 6 6 +3.000000000000000 2 2 4 4 1 1 0 0 6 6 +4.000000000000000 3 3 4 4 2 2 0 0 6 6 +4.500000000000000 3 3 3 3 2 2 0 0 6 6 +5.000000000000000 3 3 3 3 3 3 1 1 7 7 +5.500000000000000 3 3 3 3 3 3 21 21 7 7 +6.000000000000000 4 4 3 3 3 3 21 21 7 7 +7.000000000000000 4 4 2 2 4 4 21 21 7 7 +8.000000000000000 5 5 2 2 5 5 21 21 7 7 +9.000000000000000 5 5 1 1 5 5 21 21 7 7 +9.999999999999990 5 5 1 1 5 5 21 21 7 7 +10.000000000000000 6 6 1 1 5 5 21 21 8 8 +10.000000000000100 6 6 0 0 5 5 21 21 8 8 + + +-- !query +SELECT width_bucket(double(0.0), double('Infinity'), 5, 10) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT width_bucket(double(0.0), 5, double('-Infinity'), 20) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT width_bucket(double('Infinity'), 1, 10, 10), + width_bucket(double('-Infinity'), 1, 10, 10) +-- !query schema +struct +-- !query output +11 0 + + +-- !query +DROP TABLE width_bucket_test +-- !query schema +struct<> +-- !query output + + + -- !query CREATE TABLE num_input_test (n1 decimal(38, 18)) USING parquet -- !query schema