-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21117][SQL] Built-in SQL Function Support - WIDTH_BUCKET #28764
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1325,3 +1325,99 @@ case class BRound(child: Expression, scale: Expression) | |
| with Serializable with ImplicitCastInputTypes { | ||
| def this(child: Expression) = this(child, Literal(0)) | ||
| } | ||
|
|
||
| object WidthBucket { | ||
|
|
||
| def computeBucketNumber(value: Double, min: Double, max: Double, numBucket: Long): jl.Long = { | ||
| if (numBucket <= 0 || numBucket == Long.MaxValue || jl.Double.isNaN(value) || min == max || | ||
| jl.Double.isNaN(min) || jl.Double.isInfinite(min) || | ||
| jl.Double.isNaN(max) || jl.Double.isInfinite(max)) { | ||
| return null | ||
dongjoon-hyun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| val lower = Math.min(min, max) | ||
| val upper = Math.max(min, max) | ||
dongjoon-hyun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| if (min < max) { | ||
dongjoon-hyun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if (value < lower) { | ||
| 0L | ||
| } else if (value >= upper) { | ||
| numBucket + 1L | ||
dongjoon-hyun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } else { | ||
| (numBucket.toDouble * (value - lower) / (upper - lower)).toLong + 1L | ||
| } | ||
| } else { // `min > max` case | ||
dongjoon-hyun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if (value > upper) { | ||
| 0L | ||
| } else if (value <= lower) { | ||
| numBucket + 1L | ||
| } else { | ||
| (numBucket.toDouble * (upper - value) / (upper - lower)).toLong + 1L | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
/**
 * Returns the bucket number into which the value of this expression would fall
 * after being evaluated. Note that input arguments must follow conditions listed below;
 * otherwise, the method will return null.
 *  - `numBucket` must be greater than zero and be less than Long.MaxValue
 *  - `value`, `min`, and `max` cannot be NaN
 *  - `min` bound cannot equal `max`
 *  - `min` and `max` must be finite
 *
 * Note: If `minValue` > `maxValue`, a return value is as follows;
 *  if `value` > `minValue`, it returns 0.
 *  if `value` <= `maxValue`, it returns `numBucket` + 1.
 *  otherwise, it returns (`numBucket` * (`minValue` - `value`) / (`minValue` - `maxValue`)) + 1
 *
 * @param value is the expression to compute a bucket number in the histogram
 * @param minValue is the minimum value of the histogram
 * @param maxValue is the maximum value of the histogram
 * @param numBucket is the number of buckets
 */
@ExpressionDescription(
  usage = """
    _FUNC_(value, min_value, max_value, num_bucket) - Returns the bucket number to which
      `value` would be assigned in an equiwidth histogram with `num_bucket` buckets,
      in the range `min_value` to `max_value`.
  """,
  examples = """
    Examples:
      > SELECT _FUNC_(5.3, 0.2, 10.6, 5);
       3
      > SELECT _FUNC_(-2.1, 1.3, 3.4, 3);
       0
      > SELECT _FUNC_(8.1, 0.0, 5.7, 4);
       5
      > SELECT _FUNC_(-0.9, 5.2, 0.5, 2);
       3
  """,
  since = "3.1.0")
case class WidthBucket(
    value: Expression,
    minValue: Expression,
    maxValue: Expression,
    numBucket: Expression)
  extends QuaternaryExpression with ImplicitCastInputTypes with NullIntolerant {

  override def children: Seq[Expression] = Seq(value, minValue, maxValue, numBucket)
  override def inputTypes: Seq[AbstractDataType] = Seq(DoubleType, DoubleType, DoubleType, LongType)
  override def dataType: DataType = LongType
  override def nullable: Boolean = true

  // Display the SQL name "width_bucket" (rather than the default class-name-derived
  // "widthbucket") in error messages and output schemas.
  override def prettyName: String = "width_bucket"

  // Inputs are guaranteed non-null here (NullIntolerant); delegate the arithmetic and
  // the invalid-argument handling (which yields a null result) to the companion helper.
  override protected def nullSafeEval(input: Any, min: Any, max: Any, numBucket: Any): Any = {
    WidthBucket.computeBucketNumber(
      input.asInstanceOf[Double],
      min.asInstanceOf[Double],
      max.asInstanceOf[Double],
      numBucket.asInstanceOf[Long])
  }

  // Generated code calls the same helper as the interpreted path, so both stay in sync.
  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    defineCodeGen(ctx, ev, (input, min, max, numBucket) =>
      "org.apache.spark.sql.catalyst.expressions.WidthBucket" +
        s".computeBucketNumber($input, $min, $max, $numBucket)")
  }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,6 @@ | ||
| <!-- Automatically generated by ExpressionsSchemaSuite --> | ||
| ## Summary | ||
| - Number of queries: 337 | ||
| - Number of queries: 338 | ||
| - Number of expressions that missing example: 34 | ||
| - Expressions missing examples: and,string,tinyint,double,smallint,date,decimal,boolean,float,binary,bigint,int,timestamp,struct,cume_dist,dense_rank,input_file_block_length,input_file_block_start,input_file_name,lag,lead,monotonically_increasing_id,ntile,!,not,or,percent_rank,rank,row_number,spark_partition_id,version,window,positive,count_min_sketch | ||
| ## Schema of Built-in Functions | ||
|
|
@@ -291,6 +291,7 @@ | |
| | org.apache.spark.sql.catalyst.expressions.Uuid | uuid | SELECT uuid() | struct<uuid():string> | | ||
| | org.apache.spark.sql.catalyst.expressions.WeekDay | weekday | SELECT weekday('2009-07-30') | struct<weekday(CAST(2009-07-30 AS DATE)):int> | | ||
| | org.apache.spark.sql.catalyst.expressions.WeekOfYear | weekofyear | SELECT weekofyear('2008-02-20') | struct<weekofyear(CAST(2008-02-20 AS DATE)):int> | | ||
| | org.apache.spark.sql.catalyst.expressions.WidthBucket | width_bucket | SELECT width_bucket(5.3, 0.2, 10.6, 5) | struct<widthbucket(CAST(5.3 AS DOUBLE), CAST(0.2 AS DOUBLE), CAST(10.6 AS DOUBLE), CAST(5 AS BIGINT)):bigint> | | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The output schema is
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yea, it looks better. I'll fix it later. |
||
| | org.apache.spark.sql.catalyst.expressions.XxHash64 | xxhash64 | SELECT xxhash64('Spark', array(123), 2) | struct<xxhash64(Spark, array(123), 2):bigint> | | ||
| | org.apache.spark.sql.catalyst.expressions.Year | year | SELECT year('2016-07-30') | struct<year(CAST(2016-07-30 AS DATE)):int> | | ||
| | org.apache.spark.sql.catalyst.expressions.ZipWith | zip_with | SELECT zip_with(array(1, 2, 3), array('a', 'b', 'c'), (x, y) -> (y, x)) | struct<zip_with(array(1, 2, 3), array(a, b, c), lambdafunction(named_struct(y, namedlambdavariable(), x, namedlambdavariable()), namedlambdavariable(), namedlambdavariable())):array<struct<y:string,x:int>>> | | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.