-
Notifications
You must be signed in to change notification settings. (Fork count: 29k)
[SPARK-15094][SPARK-14803][SQL] Remove extra Project added in EliminateSerialization #12926
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
cf53a43
48e6b6d
737c518
3d0554d
4b0773a
29a0c70
85fba17
ea55398
c3748ba
882fc66
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -102,7 +102,8 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: CatalystConf) | |
| SimplifyCasts, | ||
| SimplifyCaseConversionExpressions, | ||
| RewriteCorrelatedScalarSubquery, | ||
| EliminateSerialization) :: | ||
| EliminateSerialization, | ||
| RemoveAliasOnlyProject) :: | ||
| Batch("Decimal Optimizations", fixedPoint, | ||
| DecimalAggregates) :: | ||
| Batch("Typed Filter Optimization", fixedPoint, | ||
|
|
@@ -155,6 +156,59 @@ object SamplePushDown extends Rule[LogicalPlan] { | |
| } | ||
| } | ||
|
|
||
/**
 * Removes a [[Project]] node that does nothing but alias each attribute of its child,
 * keeping the same names, data types, exprIds and ordering.
 *
 * It exists mainly to clean up the extra Project added by the EliminateSerialization
 * rule, but it can benefit any plan containing such a no-op projection.
 */
object RemoveAliasOnlyProject extends Rule[LogicalPlan] {

  /**
   * Returns true when every expression in `projectList` is an [[Alias]] of the
   * attribute at the same position in `childOutput`, with matching name, data type
   * and exprId — i.e. the projection is a pure pass-through.
   */
  private def isAliasOnly(
      projectList: Seq[NamedExpression],
      childOutput: Seq[Attribute]): Boolean = {
    if (projectList.length != childOutput.length) {
      false
    } else {
      // Pattern match instead of asInstanceOf: non-Alias entries fall through to false.
      projectList.zip(childOutput).forall {
        case (a @ Alias(attr: Attribute, _), o) =>
          a.name == attr.name && attr.name == o.name &&
            attr.dataType == o.dataType && attr.exprId == o.exprId
        case _ => false
      }
    }
  }

  def apply(plan: LogicalPlan): LogicalPlan = {
    // Locate the first alias-only Project in the plan, if any.
    val aliasOnlyProject = plan.find {
      case Project(pList, child) => isAliasOnly(pList, child.output)
      case _ => false
    }

    aliasOnlyProject.map { case p: Project =>
      // Map each alias's output exprId back to the underlying child attribute,
      // so references above the Project keep resolving after it is removed.
      val attrMap = p.projectList.collect {
        case a @ Alias(attr: Attribute, _) => a.toAttribute.exprId -> attr
      }.toMap
      // First rewrite all references to the aliased attributes, then drop the
      // now-redundant Project node itself.
      plan.transformAllExpressions {
        case a: Attribute if attrMap.contains(a.exprId) => attrMap(a.exprId)
      }.transform {
        case op: Project if op == p => op.child
      }
    }.getOrElse(plan)
  }
}
|
|
||
| /** | ||
| * Removes cases where we are unnecessarily going between the object and serialized (InternalRow) | ||
| * representation of data item. For example back to back map operations. | ||
|
|
@@ -163,15 +217,10 @@ object EliminateSerialization extends Rule[LogicalPlan] { | |
| def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
| case d @ DeserializeToObject(_, _, s: SerializeFromObject) | ||
| if d.outputObjectType == s.inputObjectType => | ||
| // A workaround for SPARK-14803. Remove this after it is fixed. | ||
| if (d.outputObjectType.isInstanceOf[ObjectType] && | ||
| d.outputObjectType.asInstanceOf[ObjectType].cls == classOf[org.apache.spark.sql.Row]) { | ||
| s.child | ||
| } else { | ||
| // Adds an extra Project here, to preserve the output expr id of `DeserializeToObject`. | ||
| val objAttr = Alias(s.child.output.head, "obj")(exprId = d.output.head.exprId) | ||
| Project(objAttr :: Nil, s.child) | ||
| } | ||
| // Adds an extra Project here, to preserve the output expr id of `DeserializeToObject`. | ||
| // We will remove it later in RemoveAliasOnlyProject rule. | ||
| val objAttr = Alias(s.child.output.head, "obj")(exprId = d.output.head.exprId) | ||
|
||
| Project(objAttr :: Nil, s.child) | ||
| case a @ AppendColumns(_, _, _, s: SerializeFromObject) | ||
| if a.deserializer.dataType == s.inputObjectType => | ||
| AppendColumnsWithObject(a.func, s.serializer, a.serializer, s.child) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -659,6 +659,16 @@ class DatasetSuite extends QueryTest with SharedSQLContext { | |
| checkDataset(DatasetTransform.addOne(dataset), 2, 3, 4) | ||
| } | ||
|
|
||
test("dataset.rdd with generic case class") {
  // Regression test for SPARK-15094: converting a Dataset of a generic case
  // class (or of a boxed primitive) to an RDD used to fail during planning.
  val generics = Seq(Generic(1, 1.0), Generic(2, 2.0)).toDS
  val remapped = generics.map(g => Generic(g.id, g.value))
  generics.rdd.map(r => r.id).count
  remapped.rdd.map(r => r.id).count

  val boxedLongs = generics.map(g => new java.lang.Long(g.id))
  boxedLongs.rdd.map(r => r).count
}
|
|
||
| test("runtime null check for RowEncoder") { | ||
| val schema = new StructType().add("i", IntegerType, nullable = false) | ||
| val df = sqlContext.range(10).map(l => { | ||
|
|
@@ -676,6 +686,8 @@ class DatasetSuite extends QueryTest with SharedSQLContext { | |
| } | ||
| } | ||
|
|
||
/** Case class with a generic id field, used to reproduce SPARK-15094. */
case class Generic[T](id: T, value: Double)

/** Case class whose field names shadow the built-in tuple accessors _1/_2. */
case class OtherTuple(_1: String, _2: Int)

/** Case class wrapping a tuple-typed field, for nested tuple encoder tests. */
case class TupleClass(data: (Int, String))
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
isAliasOnly