-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-23587][SQL] Add interpreted execution for MapObjects expression #20771
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
c55a634
f86f40e
3627dc3
07f8143
9144287
e725608
f0ba614
d4f0ecb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions.objects | |
|
|
||
| import java.lang.reflect.Modifier | ||
|
|
||
| import scala.collection.JavaConverters._ | ||
| import scala.collection.mutable.Builder | ||
| import scala.language.existentials | ||
| import scala.reflect.ClassTag | ||
|
|
@@ -501,12 +502,22 @@ case class LambdaVariable( | |
| value: String, | ||
| isNull: String, | ||
| dataType: DataType, | ||
| nullable: Boolean = true) extends LeafExpression | ||
| with Unevaluable with NonSQLExpression { | ||
| nullable: Boolean = true) extends LeafExpression with NonSQLExpression { | ||
|
|
||
| // Interpreted execution of `LambdaVariable` always gets the 0-index element from input row. | ||
| override def eval(input: InternalRow): Any = { | ||
| assert(input.numFields == 1, | ||
| "The input row of interpreted LambdaVariable should have only 1 field.") | ||
| input.get(0, dataType) | ||
| } | ||
|
|
||
| override def genCode(ctx: CodegenContext): ExprCode = { | ||
| ExprCode(code = "", value = value, isNull = if (nullable) isNull else "false") | ||
| } | ||
|
|
||
| // This won't be called as `genCode` is overridden, just overriding it to make | ||
| // `LambdaVariable` non-abstract. | ||
| override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = ev | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -599,8 +610,71 @@ case class MapObjects private( | |
|
|
||
| override def children: Seq[Expression] = lambdaFunction :: inputData :: Nil | ||
|
|
||
| override def eval(input: InternalRow): Any = | ||
| throw new UnsupportedOperationException("Only code-generated evaluation is supported") | ||
| // The data with PythonUserDefinedType are actually stored with the data type of its sqlType. | ||
| // When we want to apply MapObjects on it, we have to use it. | ||
| lazy private val inputDataType = inputData.dataType match { | ||
| case p: PythonUserDefinedType => p.sqlType | ||
|
||
| case _ => inputData.dataType | ||
| } | ||
|
|
||
| private def executeFuncOnCollection(inputCollection: Seq[_]): Seq[_] = { | ||
| inputCollection.map { element => | ||
| val row = InternalRow.fromSeq(Seq(element)) | ||
|
||
| lambdaFunction.eval(row) | ||
| } | ||
| } | ||
|
|
||
| override def eval(input: InternalRow): Any = { | ||
| val inputCollection = inputData.eval(input) | ||
|
|
||
| if (inputCollection == null) { | ||
| return inputCollection | ||
|
||
| } | ||
|
|
||
| val results = inputDataType match { | ||
|
||
| case ObjectType(cls) if classOf[Seq[_]].isAssignableFrom(cls) => | ||
| executeFuncOnCollection(inputCollection.asInstanceOf[Seq[_]]) | ||
| case ObjectType(cls) if cls.isArray => | ||
| executeFuncOnCollection(inputCollection.asInstanceOf[Array[_]].toSeq) | ||
| case ObjectType(cls) if classOf[java.util.List[_]].isAssignableFrom(cls) => | ||
| executeFuncOnCollection(inputCollection.asInstanceOf[java.util.List[_]].asScala) | ||
| case ObjectType(cls) if cls == classOf[Object] => | ||
| if (inputCollection.getClass.isArray) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. (I am sorry for sounding like a broken record) But can we move this check out of the function closure?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry... |
||
| executeFuncOnCollection(inputCollection.asInstanceOf[Array[_]].toSeq) | ||
| } else { | ||
| executeFuncOnCollection(inputCollection.asInstanceOf[Seq[_]]) | ||
| } | ||
| case ArrayType(et, _) => | ||
| executeFuncOnCollection(inputCollection.asInstanceOf[ArrayData].array) | ||
| } | ||
|
|
||
| customCollectionCls match { | ||
|
||
| case Some(cls) if classOf[Seq[_]].isAssignableFrom(cls) => | ||
| // Scala sequence | ||
| results.toSeq | ||
| case Some(cls) if classOf[scala.collection.Set[_]].isAssignableFrom(cls) => | ||
| // Scala set | ||
| results.toSet | ||
| case Some(cls) if classOf[java.util.List[_]].isAssignableFrom(cls) => | ||
| // Java list | ||
| if (cls == classOf[java.util.List[_]] || cls == classOf[java.util.AbstractList[_]] || | ||
| cls == classOf[java.util.AbstractSequentialList[_]]) { | ||
| results.asJava | ||
| } else { | ||
| val builder = Try(cls.getConstructor(Integer.TYPE)).map { constructor => | ||
|
||
| constructor.newInstance(results.length.asInstanceOf[Object]) | ||
| }.getOrElse { | ||
| cls.getConstructor().newInstance() | ||
| }.asInstanceOf[java.util.List[Any]] | ||
|
|
||
| results.foreach(builder.add(_)) | ||
| builder | ||
| } | ||
| case None => | ||
| // array | ||
| new GenericArrayData(results.toArray) | ||
| } | ||
| } | ||
|
|
||
| override def dataType: DataType = | ||
| customCollectionCls.map(ObjectType.apply).getOrElse( | ||
|
|
@@ -647,13 +721,6 @@ case class MapObjects private( | |
| case _ => "" | ||
| } | ||
|
|
||
| // The data with PythonUserDefinedType are actually stored with the data type of its sqlType. | ||
| // When we want to apply MapObjects on it, we have to use it. | ||
| val inputDataType = inputData.dataType match { | ||
| case p: PythonUserDefinedType => p.sqlType | ||
| case _ => inputData.dataType | ||
| } | ||
|
|
||
| // `MapObjects` generates a while loop to traverse the elements of the input collection. We | ||
| // need to take care of Seq and List because they may have O(n) complexity for indexed accessing | ||
| // like `list.get(1)`. Here we use Iterator to traverse Seq and List. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not a change for this PR. Maybe we should use accessors here? This uses matching under the hood and is slower than virtual function dispatch. Implementing this would also be useful for BoundReference, for example.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You mean something like this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I do.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's spin that off into a different ticket if we want to work on it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok. After this is merged, I will create another PR for it.