-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-16686][SQL] Remove PushProjectThroughSample since it is handled by ColumnPruning #14327
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
9521a5a
6d1616d
31a6f6f
5c4e7ff
20e9436
dc70f1d
2186b7e
3e134f1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -150,13 +150,20 @@ class SimpleTestOptimizer extends Optimizer( | |
|
|
||
| /** | ||
| * Pushes projects down beneath Sample to enable column pruning with sampling. | ||
| * This rule is only doable when the projects don't add new attributes. | ||
| */ | ||
| object PushProjectThroughSample extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
| // Push down projection into sample | ||
| case Project(projectList, Sample(lb, up, replace, seed, child)) => | ||
| case p @ Project(projectList, Sample(lb, up, replace, seed, child)) | ||
| if !hasNewOutput(projectList, p.child.output) => | ||
| Sample(lb, up, replace, seed, Project(projectList, child))() | ||
| } | ||
| private def hasNewOutput( | ||
| projectList: Seq[NamedExpression], | ||
| childOutput: Seq[Attribute]): Boolean = { | ||
| projectList.exists(p => !childOutput.exists(_.semanticEquals(p))) | ||
|
||
| } | ||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -422,6 +422,35 @@ class DatasetSuite extends QueryTest with SharedSQLContext { | |
| 3, 17, 27, 58, 62) | ||
| } | ||
|
|
||
| test("SPARK-16686: Dataset.sample with seed results shouldn't depend on downstream usage") { | ||
| val udfOne = spark.udf.register("udfOne", (n: Int) => { | ||
|
||
| if (n == 1) { | ||
| throw new RuntimeException("udfOne shouldn't see swid=1!") | ||
|
||
| } else { | ||
| 1 | ||
| } | ||
| }) | ||
|
|
||
| val d = Seq( | ||
| (0, "string0"), | ||
| (1, "string1"), | ||
| (2, "string2"), | ||
| (3, "string3"), | ||
| (4, "string4"), | ||
| (5, "string5"), | ||
| (6, "string6"), | ||
| (7, "string7"), | ||
| (8, "string8"), | ||
| (9, "string9") | ||
| ) | ||
| val df = spark.createDataFrame(d).toDF("swid", "stringData") | ||
|
||
| val sampleDF = df.sample(false, 0.7, 50) | ||
| // After sampling, sampleDF doesn't contain swid=1. | ||
| assert(!sampleDF.select("swid").collect.contains(1)) | ||
| // udfOne should not encounter swid=1. | ||
| sampleDF.select(udfOne($"swid")).collect | ||
|
||
| } | ||
|
|
||
| test("SPARK-11436: we should rebind right encoder when join 2 datasets") { | ||
| val ds1 = Seq("1", "2").toDS().as("a") | ||
| val ds2 = Seq(2, 3).toDS().as("b") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should we merge this rule into `ColumnPruning`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah, looks like `ColumnPruning` already handles it, can we just remove this rule?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure which part you mean? I don't see `ColumnPruning` handling `Sample`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the last case in `ColumnPruning`, it will generate a new `Project` under `Sample`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yah. I will update this. At least one optimizer test uses this rule. The test should be changed too.