-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21117][SQL] Built-in SQL Function Support - WIDTH_BUCKET #28764
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1325,3 +1325,99 @@ case class BRound(child: Expression, scale: Expression) | |
| with Serializable with ImplicitCastInputTypes { | ||
| def this(child: Expression) = this(child, Literal(0)) | ||
| } | ||
|
|
||
| object WidthBucket { | ||
|
|
||
| def computeBucketNumber(value: Double, min: Double, max: Double, numBucket: Long): jl.Long = { | ||
| if (numBucket <= 0 || numBucket == Long.MaxValue || jl.Double.isNaN(value) || min == max || | ||
| jl.Double.isNaN(min) || jl.Double.isInfinite(min) || | ||
| jl.Double.isNaN(max) || jl.Double.isInfinite(max)) { | ||
| return null | ||
dongjoon-hyun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| val lower = Math.min(min, max) | ||
| val upper = Math.max(min, max) | ||
dongjoon-hyun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| if (min < max) { | ||
dongjoon-hyun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if (value < lower) { | ||
| 0L | ||
| } else if (value >= upper) { | ||
| numBucket + 1L | ||
dongjoon-hyun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } else { | ||
| (numBucket.toDouble * (value - lower) / (upper - lower)).toLong + 1L | ||
| } | ||
| } else { // `min > max` case | ||
dongjoon-hyun marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if (value > upper) { | ||
| 0L | ||
| } else if (value <= lower) { | ||
| numBucket + 1L | ||
| } else { | ||
| (numBucket.toDouble * (upper - value) / (upper - lower)).toLong + 1L | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
/**
 * Returns the bucket number into which the value of this expression would fall
 * after being evaluated. Note that input arguments must follow conditions listed below;
 * otherwise, the method will return null.
 *  - `numBucket` must be greater than zero and be less than Long.MaxValue
 *  - `value`, `min`, and `max` cannot be NaN
 *  - `min` bound cannot equal `max`
 *  - `min` and `max` must be finite
 *
 * Note: If `minValue` > `maxValue`, a return value is as follows;
 *  if `value` > `minValue`, it returns 0.
 *  if `value` <= `maxValue`, it returns `numBucket` + 1.
 *  otherwise, it returns (`numBucket` * (`minValue` - `value`) / (`minValue` - `maxValue`)) + 1
 *
 * @param value is the expression to compute a bucket number in the histogram
 * @param minValue is the minimum value of the histogram
 * @param maxValue is the maximum value of the histogram
 * @param numBucket is the number of buckets
 */
@ExpressionDescription(
  usage = """
    _FUNC_(value, min_value, max_value, num_bucket) - Returns the bucket number to which
      `value` would be assigned in an equiwidth histogram with `num_bucket` buckets,
      in the range `min_value` to `max_value`.
  """,
  examples = """
    Examples:
      > SELECT _FUNC_(5.3, 0.2, 10.6, 5);
       3
      > SELECT _FUNC_(-2.1, 1.3, 3.4, 3);
       0
      > SELECT _FUNC_(8.1, 0.0, 5.7, 4);
       5
      > SELECT _FUNC_(-0.9, 5.2, 0.5, 2);
       3
  """,
  since = "3.1.0")
case class WidthBucket(
    value: Expression,
    minValue: Expression,
    maxValue: Expression,
    numBucket: Expression)
  extends QuaternaryExpression with ImplicitCastInputTypes with NullIntolerant {

  override def children: Seq[Expression] = Seq(value, minValue, maxValue, numBucket)
  override def inputTypes: Seq[AbstractDataType] = Seq(DoubleType, DoubleType, DoubleType, LongType)
  override def dataType: DataType = LongType
  override def nullable: Boolean = true

  // Display the SQL name "width_bucket" (rather than the default class-name-derived
  // "widthbucket") in error messages and output schemas.
  override def prettyName: String = "width_bucket"

  // Inputs are guaranteed non-null here (NullIntolerant); delegate the arithmetic and
  // the invalid-argument handling (which yields a null result) to the companion helper.
  override protected def nullSafeEval(input: Any, min: Any, max: Any, numBucket: Any): Any = {
    WidthBucket.computeBucketNumber(
      input.asInstanceOf[Double],
      min.asInstanceOf[Double],
      max.asInstanceOf[Double],
      numBucket.asInstanceOf[Long])
  }

  // Generated code calls the same helper as the interpreted path, so both stay in sync.
  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    defineCodeGen(ctx, ev, (input, min, max, numBucket) =>
      "org.apache.spark.sql.catalyst.expressions.WidthBucket" +
        s".computeBucketNumber($input, $min, $max, $numBucket)")
  }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,6 @@ | ||
| <!-- Automatically generated by ExpressionsSchemaSuite --> | ||
| ## Summary | ||
| - Number of queries: 337 | ||
| - Number of queries: 338 | ||
| - Number of expressions that missing example: 34 | ||
| - Expressions missing examples: and,string,tinyint,double,smallint,date,decimal,boolean,float,binary,bigint,int,timestamp,struct,cume_dist,dense_rank,input_file_block_length,input_file_block_start,input_file_name,lag,lead,monotonically_increasing_id,ntile,!,not,or,percent_rank,rank,row_number,spark_partition_id,version,window,positive,count_min_sketch | ||
| ## Schema of Built-in Functions | ||
|
|
@@ -291,6 +291,7 @@ | |
| | org.apache.spark.sql.catalyst.expressions.Uuid | uuid | SELECT uuid() | struct<uuid():string> | | ||
| | org.apache.spark.sql.catalyst.expressions.WeekDay | weekday | SELECT weekday('2009-07-30') | struct<weekday(CAST(2009-07-30 AS DATE)):int> | | ||
| | org.apache.spark.sql.catalyst.expressions.WeekOfYear | weekofyear | SELECT weekofyear('2008-02-20') | struct<weekofyear(CAST(2008-02-20 AS DATE)):int> | | ||
| | org.apache.spark.sql.catalyst.expressions.WidthBucket | width_bucket | SELECT width_bucket(5.3, 0.2, 10.6, 5) | struct<widthbucket(CAST(5.3 AS DOUBLE), CAST(0.2 AS DOUBLE), CAST(10.6 AS DOUBLE), CAST(5 AS BIGINT)):bigint> | | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The output schema is
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yea, it looks better. I'll fix it later. |
||
| | org.apache.spark.sql.catalyst.expressions.XxHash64 | xxhash64 | SELECT xxhash64('Spark', array(123), 2) | struct<xxhash64(Spark, array(123), 2):bigint> | | ||
| | org.apache.spark.sql.catalyst.expressions.Year | year | SELECT year('2016-07-30') | struct<year(CAST(2016-07-30 AS DATE)):int> | | ||
| | org.apache.spark.sql.catalyst.expressions.ZipWith | zip_with | SELECT zip_with(array(1, 2, 3), array('a', 'b', 'c'), (x, y) -> (y, x)) | struct<zip_with(array(1, 2, 3), array(a, b, c), lambdafunction(named_struct(y, namedlambdavariable(), x, namedlambdavariable()), namedlambdavariable(), namedlambdavariable())):array<struct<y:string,x:int>>> | | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.