apache · viirya · Jul 23, 2016 · Jul 23, 2016 · Jul 25, 2016 · Jul 25, 2016
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -150,13 +150,20 @@ class SimpleTestOptimizer extends Optimizer(
 
 /**
  * Pushes projects down beneath Sample to enable column pruning with sampling.
+ * Applied only when each projected expression semantically equals a Sample output attribute.
  */
 object PushProjectThroughSample extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
     // Push down projection into sample
-    case Project(projectList, Sample(lb, up, replace, seed, child)) =>
+    case p @ Project(projectList, Sample(lb, up, replace, seed, child))
+        if !hasNewOutput(projectList, p.child.output) =>
       Sample(lb, up, replace, seed, Project(projectList, child))()
   }
+  private def hasNewOutput(
+      projectList: Seq[NamedExpression],
+      childOutput: Seq[Attribute]): Boolean = {
+    projectList.exists(p => !childOutput.exists(_.semanticEquals(p)))
+  }
 }
 
 /**

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala
@@ -601,6 +601,22 @@ class FilterPushdownSuite extends PlanTest {
     comparePlans(optimized, correctAnswer.analyze)
   }
 
+  test("don't push project down into sample if project brings new attributes") {
+    // The projection aliases 'a to a new attribute 'aa, so it must stay above the Sample.
+    val relation = testRelation.subquery('x)
+    val sampled = Sample(0.0, 0.6, false, 11L, relation)()
+    val query = sampled.select('a as 'aa)
+
+    val analyzed =
+      EliminateSubqueryAliases(analysis.SimpleAnalyzer.execute(query))
+    val actual = Optimize.execute(analyzed)
+
+    // Expected plan is the unchanged query: the Project remains on top of the Sample.
+    val expected =
+      Sample(0.0, 0.6, false, 11L, relation)().select('a as 'aa)
+    comparePlans(actual, expected.analyze)
+  }
+
   test("aggregate: push down filter when filter on group by expression") {
     val originalQuery = testRelation
                         .groupBy('a)('a, count('b) as 'c)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -422,6 +422,35 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
       3, 17, 27, 58, 62)
   }
 
+  test("SPARK-16686: Dataset.sample with seed results shouldn't depend on downstream usage") {
+    // Tripwire UDF: throws if it is ever evaluated on swid=1, otherwise returns 1.
+    val udfOne = spark.udf.register("udfOne", (n: Int) => {
+      if (n == 1) {
+        throw new RuntimeException("udfOne shouldn't see swid=1!")
+      }
+      1
+    })
+
+    val d = Seq(
+      (0, "string0"),
+      (1, "string1"),
+      (2, "string2"),
+      (3, "string3"),
+      (4, "string4"),
+      (5, "string5"),
+      (6, "string6"),
+      (7, "string7"),
+      (8, "string8"),
+      (9, "string9")
+    )
+    val df = spark.createDataFrame(d).toDF("swid", "stringData")
+    val sampleDF = df.sample(false, 0.7, 50)
+    // collect() yields Rows, so unwrap the Int column before asserting that swid=1 is absent.
+    assert(!sampleDF.select("swid").collect().map(_.getInt(0)).contains(1))
+    // The UDF must only see the sampled rows; evaluating it on swid=1 would throw here.
+    sampleDF.select(udfOne($"swid")).collect()
+  }
+
   test("SPARK-11436: we should rebind right encoder when join 2 datasets") {
     val ds1 = Seq("1", "2").toDS().as("a")
     val ds2 = Seq(2, 3).toDS().as("b")