@@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.encoders.OuterScopes
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.SubExprUtils._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
-import org.apache.spark.sql.catalyst.expressions.objects.{LambdaVariable, MapObjects, NewInstance, UnresolvedMapObjects}
+import org.apache.spark.sql.catalyst.expressions.objects._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
@@ -2145,14 +2145,24 @@ class Analyzer(
           val parameterTypes = ScalaReflection.getParameterTypes(func)
           assert(parameterTypes.length == inputs.length)
 
+          // TODO: skip null handling for not-nullable primitive inputs after we can completely
+          // trust the `nullable` information.
+          // (cls, expr) => cls.isPrimitive && expr.nullable
+          val needsNullCheck = (cls: Class[_], expr: Expression) =>
+            cls.isPrimitive && !expr.isInstanceOf[AssertNotNull]
           val inputsNullCheck = parameterTypes.zip(inputs)
-            // TODO: skip null handling for not-nullable primitive inputs after we can completely
-            // trust the `nullable` information.
-            // .filter { case (cls, expr) => cls.isPrimitive && expr.nullable }
-            .filter { case (cls, _) => cls.isPrimitive }
+            .filter { case (cls, expr) => needsNullCheck(cls, expr) }
             .map { case (_, expr) => IsNull(expr) }
             .reduceLeftOption[Expression]((e1, e2) => Or(e1, e2))
-          inputsNullCheck.map(If(_, Literal.create(null, udf.dataType), udf)).getOrElse(udf)
+          // Once we add an `If` check above the udf, it is safe to mark those checked inputs
+          // as not nullable (i.e., wrap them with `AssertNotNull`), because the null-returning
+          // branch of `If` will be called if any of these checked inputs is null. Thus we can
+          // prevent this rule from being applied repeatedly.
+          val newInputs = parameterTypes.zip(inputs).map{ case (cls, expr) =>
+            if (needsNullCheck(cls, expr)) AssertNotNull(expr) else expr }
Review comment (Member): Let us introduce KnownNotNull instead of using AssertNotNull, which has a side-effect?

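A note on the suggestion above: AssertNotNull raises a runtime error when its child evaluates to null, so using it purely as a nullability marker carries that side effect. A side-effect-free tagging expression along the lines the reviewer proposes might look roughly like the sketch below; the name KnownNotNull is taken from the comment, and the class shape (extending UnaryExpression and delegating codegen to the child) is an assumption, not code from this patch.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.DataType

// Hypothetical side-effect-free marker: it only declares its child non-nullable.
// Unlike AssertNotNull, it never throws when the child actually is null.
case class KnownNotNull(child: Expression) extends UnaryExpression {
  override def nullable: Boolean = false
  override def dataType: DataType = child.dataType

  // Pass the value through unchanged.
  override def eval(input: InternalRow): Any = child.eval(input)

  // Reuse the child's generated code as-is.
  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode =
    child.genCode(ctx)
}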
+          inputsNullCheck
+            .map(If(_, Literal.create(null, udf.dataType), udf.copy(children = newInputs)))
+            .getOrElse(udf)
       }
     }
   }
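To make the two new comment blocks concrete, here is an illustration (not part of the patch) of what the rewritten rule produces for a ScalaUDF over two nullable Int attributes, and why a second analyzer pass leaves the result unchanged. The attribute names and the standalone expression construction are assumptions for the sketch only.

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{If, IsNull, Literal, Or, ScalaUDF}
import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull
import org.apache.spark.sql.types.IntegerType

val a = 'a.int
val b = 'b.int
val myUdf = ScalaUDF((x: Int, y: Int) => x + y, IntegerType, a :: b :: Nil)

// First pass: both inputs are primitive and not yet wrapped in AssertNotNull, so
// needsNullCheck is true for each, and the rule rewrites the UDF roughly into:
val rewritten = If(
  Or(IsNull(a), IsNull(b)),
  Literal.create(null, myUdf.dataType),
  myUdf.copy(children = AssertNotNull(a) :: AssertNotNull(b) :: Nil))

// Second pass: every checked input is now an AssertNotNull, so needsNullCheck is
// false, inputsNullCheck is None, and the expression is returned unchanged --
// the rule no longer stacks another If on every analyzer run.

Because the wrapped children are AssertNotNull, re-analysis is a no-op, which is exactly the idempotence that the new AnalysisSuite and UDFSuite tests below verify with comparePlans.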
@@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull
 import org.apache.spark.sql.catalyst.plans.{Cross, Inner}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning,
@@ -316,15 +317,16 @@ class AnalysisSuite extends AnalysisTest with Matchers {
 
     // only primitive parameter needs special null handling
     val udf2 = ScalaUDF((s: String, d: Double) => "x", StringType, string :: double :: Nil)
-    val expected2 = If(IsNull(double), nullResult, udf2)
+    val expected2 =
+      If(IsNull(double), nullResult, udf2.copy(children = string :: AssertNotNull(double) :: Nil))
     checkUDF(udf2, expected2)
 
     // special null handling should apply to all primitive parameters
     val udf3 = ScalaUDF((s: Short, d: Double) => "x", StringType, short :: double :: Nil)
     val expected3 = If(
       IsNull(short) || IsNull(double),
       nullResult,
-      udf3)
+      udf3.copy(children = AssertNotNull(short) :: AssertNotNull(double) :: Nil))
     checkUDF(udf3, expected3)
 
     // we can skip special null handling for primitive parameters that are not nullable
@@ -336,10 +338,19 @@
     val expected4 = If(
       IsNull(short),
       nullResult,
-      udf4)
+      udf4.copy(children = AssertNotNull(short) :: double.withNullability(false) :: Nil))
     // checkUDF(udf4, expected4)
   }
 
+  test("SPARK-24891 Fix HandleNullInputsForUDF rule") {
+    val a = testRelation.output(0)
+    val func = (x: Int, y: Int) => x + y
+    val udf1 = ScalaUDF(func, IntegerType, a :: a :: Nil)
+    val udf2 = ScalaUDF(func, IntegerType, a :: udf1 :: Nil)
+    val plan = Project(Alias(udf2, "")() :: Nil, testRelation)
+    comparePlans(plan.analyze, plan.analyze.analyze)
+  }
+
   test("SPARK-11863 mixture of aliases and real columns in order by clause - tpcds 19,55,71") {
     val a = testRelation2.output(0)
     val c = testRelation2.output(2)
sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala (30 additions, 1 deletion)
@@ -20,7 +20,7 @@ package org.apache.spark.sql
 import org.apache.spark.sql.api.java._
 import org.apache.spark.sql.catalyst.plans.logical.Project
 import org.apache.spark.sql.execution.command.ExplainCommand
-import org.apache.spark.sql.functions.udf
+import org.apache.spark.sql.functions.{lit, udf}
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.test.SQLTestData._
 import org.apache.spark.sql.types.{DataTypes, DoubleType}
@@ -324,4 +324,33 @@ class UDFSuite extends QueryTest with SharedSQLContext {
       assert(outputStream.toString.contains("UDF:f(a._1 AS `_1`)"))
     }
   }
+
+  test("SPARK-24891 Fix HandleNullInputsForUDF rule") {
+    val udf1 = udf({(x: Int, y: Int) => x + y})
+    val df = spark.range(0, 3).toDF("a")
+      .withColumn("b", udf1($"a", udf1($"a", lit(10))))
+      .withColumn("c", udf1($"a", lit(null)))
+    val plan = spark.sessionState.executePlan(df.logicalPlan).analyzed
+
+    comparePlans(df.logicalPlan, plan)
+    checkAnswer(
+      df,
+      Seq(
+        Row(0, 10, null),
+        Row(1, 12, null),
+        Row(2, 14, null)))
+  }
+
+  test("SPARK-24891 Fix HandleNullInputsForUDF rule - with table") {
+    withTable("x") {
+      Seq((1, "2"), (2, "4")).toDF("a", "b").write.format("json").saveAsTable("x")
+      sql("insert into table x values(3, null)")
+      sql("insert into table x values(null, '4')")
+      spark.udf.register("f", (a: Int, b: String) => a + b)
+      val df = spark.sql("SELECT f(a, b) FROM x")
+      val plan = spark.sessionState.executePlan(df.logicalPlan).analyzed
+      comparePlans(df.logicalPlan, plan)
+      checkAnswer(df, Seq(Row("12"), Row("24"), Row("3null"), Row(null)))
+    }
+  }
 }