-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-3891][SQL] Add array support to percentile, percentile_approx and constant inspectors support #2802
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-3891][SQL] Add array support to percentile, percentile_approx and constant inspectors support #2802
Changes from 7 commits
7f94aff
cb7c61e
47f6365
f37fd69
4d39105
c46db0f
a18f917
a0182e5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -164,6 +164,11 @@ private[hive] case class HiveGenericUdf(functionClassName: String, children: Seq | |
| override def foldable = | ||
| isUDFDeterministic && returnInspector.isInstanceOf[ConstantObjectInspector] | ||
|
|
||
| @transient | ||
| protected lazy val constantReturnValue = unwrap( | ||
| returnInspector.asInstanceOf[ConstantObjectInspector].getWritableConstantValue(), | ||
| returnInspector) | ||
|
|
||
| @transient | ||
| protected lazy val deferedObjects = | ||
| argumentInspectors.map(new DeferredObjectAdapter(_)).toArray[DeferredObject] | ||
|
|
@@ -172,6 +177,8 @@ private[hive] case class HiveGenericUdf(functionClassName: String, children: Seq | |
|
|
||
| override def eval(input: Row): Any = { | ||
| returnInspector // Make sure initialized. | ||
| if(foldable) return constantReturnValue | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is probably unnecessary, as constant folding rule in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Constant check and returning value is required for two reasons:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the explanation, I guess there is probably a bug in master as you described, can you paste a query to reproduce the failure? (Text V.S. String).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In HiveQuerySuite, "constant array" testcase was failing [info] - constant array *** FAILED *** (596 milliseconds)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @chenghao-intel, I think you understand this code better than I do. Are you satisfied with the explanation? Does this approach seem reasonable?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this failure is due to the bug of nested constant Expression <-> ObjectInspector in @gvramana , how about revert the changes in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @chenghao-intel This PR cannot be separately merged without CreateArray, as percentile_approx accepts only constant array iterator and fails otherwise. I think we can go ahead and merge all these changes as they don't break build or tests, and are not directly dependent on #3429 in order of merge.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| var i = 0 | ||
| while (i < children.length) { | ||
| val idx = i | ||
|
|
@@ -198,12 +205,13 @@ private[hive] case class HiveGenericUdaf( | |
|
|
||
| @transient | ||
| protected lazy val objectInspector = { | ||
| resolver.getEvaluator(children.map(_.dataType.toTypeInfo).toArray) | ||
| val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors.toArray, false, false) | ||
| resolver.getEvaluator(parameterInfo) | ||
| .init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors.toArray) | ||
| } | ||
|
|
||
| @transient | ||
| protected lazy val inspectors = children.map(_.dataType).map(toInspector) | ||
| protected lazy val inspectors = children.map(toInspector) | ||
|
|
||
| def dataType: DataType = inspectorToDataType(objectInspector) | ||
|
|
||
|
|
@@ -228,12 +236,13 @@ private[hive] case class HiveUdaf( | |
|
|
||
| @transient | ||
| protected lazy val objectInspector = { | ||
| resolver.getEvaluator(children.map(_.dataType.toTypeInfo).toArray) | ||
| val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors.toArray, false, false) | ||
| resolver.getEvaluator(parameterInfo) | ||
| .init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors.toArray) | ||
| } | ||
|
|
||
| @transient | ||
| protected lazy val inspectors = children.map(_.dataType).map(toInspector) | ||
| protected lazy val inspectors = children.map(toInspector) | ||
|
|
||
| def dataType: DataType = inspectorToDataType(objectInspector) | ||
|
|
||
|
|
@@ -266,7 +275,7 @@ private[hive] case class HiveGenericUdtf( | |
| protected lazy val function: GenericUDTF = createFunction() | ||
|
|
||
| @transient | ||
| protected lazy val inputInspectors = children.map(_.dataType).map(toInspector) | ||
| protected lazy val inputInspectors = children.map(toInspector) | ||
|
|
||
| @transient | ||
| protected lazy val outputInspector = function.initialize(inputInspectors.toArray) | ||
|
|
@@ -340,10 +349,13 @@ private[hive] case class HiveUdafFunction( | |
| } else { | ||
| createFunction[AbstractGenericUDAFResolver]() | ||
| } | ||
|
|
||
| private val inspectors = exprs.map(_.dataType).map(toInspector).toArray | ||
|
|
||
| private val function = resolver.getEvaluator(exprs.map(_.dataType.toTypeInfo).toArray) | ||
|
|
||
| private val inspectors = exprs.map(toInspector).toArray | ||
|
|
||
| private val function = { | ||
| val parameterInfo = new SimpleGenericUDAFParameterInfo(inspectors, false, false) | ||
| resolver.getEvaluator(parameterInfo) | ||
| } | ||
|
|
||
| private val returnInspector = function.init(GenericUDAFEvaluator.Mode.COMPLETE, inspectors) | ||
|
|
||
|
|
@@ -356,8 +368,11 @@ private[hive] case class HiveUdafFunction( | |
| @transient | ||
| val inputProjection = new InterpretedProjection(exprs) | ||
|
|
||
| @transient | ||
| protected lazy val cached = new Array[AnyRef](exprs.length) | ||
|
|
||
| def update(input: Row): Unit = { | ||
| val inputs = inputProjection(input).asInstanceOf[Seq[AnyRef]].toArray | ||
| function.iterate(buffer, inputs) | ||
| function.iterate(buffer, wrap(inputs, inspectors, cached)) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this just be a
def? It will only be called once correct?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, fixed the same.