4 changes: 2 additions & 2 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -2147,7 +2147,7 @@ test_that("group by, agg functions", {
df3 <- agg(gd, age = "stddev")
expect_is(df3, "SparkDataFrame")
df3_local <- collect(df3)
-  expect_true(is.nan(df3_local[df3_local$name == "Andy", ][1, 2]))
+  expect_true(is.na(df3_local[df3_local$name == "Andy", ][1, 2]))

df4 <- agg(gd, sumAge = sum(df$age))
expect_is(df4, "SparkDataFrame")
@@ -2178,7 +2178,7 @@ test_that("group by, agg functions", {
df7 <- agg(gd2, value = "stddev")
df7_local <- collect(df7)
expect_true(abs(df7_local[df7_local$name == "ID1", ][1, 2] - 6.928203) < 1e-6)
-  expect_true(is.nan(df7_local[df7_local$name == "ID2", ][1, 2]))
+  expect_true(is.na(df7_local[df7_local$name == "ID2", ][1, 2]))

mockLines3 <- c("{\"name\":\"Andy\", \"age\":30}",
"{\"name\":\"Andy\", \"age\":30}",
2 changes: 2 additions & 0 deletions docs/sql-migration-guide.md
@@ -24,6 +24,8 @@ license: |

## Upgrading from Spark SQL 3.0 to 3.1

+- In Spark 3.1, the statistical aggregation functions `std`, `stddev`, `stddev_samp`, `variance`, `var_samp`, `skewness`, `kurtosis`, `covar_samp`, and `corr` return `NULL` instead of `Double.NaN` when a divide-by-zero occurs during expression evaluation, for example, when `stddev_samp` is applied to a single-element set. In Spark version 3.0 and earlier, they return `Double.NaN` in such cases. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.statisticalAggregate` to `true`.
+
- In Spark 3.1, grouping_id() returns long values. In Spark version 3.0 and earlier, this function returns int values. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.integerGroupingId` to `true`.

- In Spark 3.1, SQL UI data adopts the `formatted` mode for the query plan explain results. To restore the behavior before Spark 3.1, you can set `spark.sql.ui.explainMode` to `extended`.
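Concretely, the behavior change and the escape hatch look like this from the Scala API. A minimal sketch, assuming a local `SparkSession`; the conf name comes from the migration note above, and `stddev_samp` over a single-element group divides by `n - 1 = 0`:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[1]")
  .appName("statistical-aggregate-demo")
  .getOrCreate()
import spark.implicits._

val single = Seq(1.0).toDF("v")

// Spark 3.1 default: the divide-by-zero case yields NULL.
single.selectExpr("stddev_samp(v)").show()

// Restore the pre-3.1 behavior: the same query yields NaN.
spark.conf.set("spark.sql.legacy.statisticalAggregate", "true")
single.selectExpr("stddev_samp(v)").show()
```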
@@ -450,14 +450,20 @@ object TypeCoercion {
case Abs(e @ StringType()) => Abs(Cast(e, DoubleType))
case Sum(e @ StringType()) => Sum(Cast(e, DoubleType))
case Average(e @ StringType()) => Average(Cast(e, DoubleType))
-      case StddevPop(e @ StringType()) => StddevPop(Cast(e, DoubleType))
-      case StddevSamp(e @ StringType()) => StddevSamp(Cast(e, DoubleType))
+      case s @ StddevPop(e @ StringType(), _) =>
+        s.withNewChildren(Seq(Cast(e, DoubleType)))
+      case s @ StddevSamp(e @ StringType(), _) =>
+        s.withNewChildren(Seq(Cast(e, DoubleType)))
case UnaryMinus(e @ StringType()) => UnaryMinus(Cast(e, DoubleType))
case UnaryPositive(e @ StringType()) => UnaryPositive(Cast(e, DoubleType))
-      case VariancePop(e @ StringType()) => VariancePop(Cast(e, DoubleType))
-      case VarianceSamp(e @ StringType()) => VarianceSamp(Cast(e, DoubleType))
-      case Skewness(e @ StringType()) => Skewness(Cast(e, DoubleType))
-      case Kurtosis(e @ StringType()) => Kurtosis(Cast(e, DoubleType))
+      case v @ VariancePop(e @ StringType(), _) =>
+        v.withNewChildren(Seq(Cast(e, DoubleType)))
+      case v @ VarianceSamp(e @ StringType(), _) =>
+        v.withNewChildren(Seq(Cast(e, DoubleType)))
+      case s @ Skewness(e @ StringType(), _) =>
+        s.withNewChildren(Seq(Cast(e, DoubleType)))
+      case k @ Kurtosis(e @ StringType(), _) =>
+        k.withNewChildren(Seq(Cast(e, DoubleType)))
}
}

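The move from rebuilding the node (e.g. `StddevPop(Cast(e, DoubleType))`) to `withNewChildren` matters because these case classes now take a second constructor argument: re-invoking the one-arg shorthand would re-read the SQL conf and could silently change `nullOnDivideByZero`. A minimal sketch of the pattern, assuming the Catalyst classes from this diff are on the classpath:

```scala
import org.apache.spark.sql.catalyst.expressions.{Cast, Expression}
import org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp
import org.apache.spark.sql.types.DoubleType

// withNewChildren replaces only the child expression and preserves all
// other constructor arguments, including the nullOnDivideByZero flag.
def coerceChild(agg: StddevSamp, e: Expression): Expression =
  agg.withNewChildren(Seq(Cast(e, DoubleType)))
```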
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions.aggregate
import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._

/**
@@ -43,7 +44,7 @@ import org.apache.spark.sql.types._
*
* @param child to compute central moments of.
*/
-abstract class CentralMomentAgg(child: Expression)
+abstract class CentralMomentAgg(child: Expression, nullOnDivideByZero: Boolean)
extends DeclarativeAggregate with ImplicitCastInputTypes {

/**
@@ -62,6 +63,13 @@ abstract class CentralMomentAgg(child: Expression)
protected val m3 = AttributeReference("m3", DoubleType, nullable = false)()
protected val m4 = AttributeReference("m4", DoubleType, nullable = false)()

+  protected def divideByZeroEvalResult: Expression = {
+    if (nullOnDivideByZero) Literal.create(null, DoubleType) else Double.NaN
+  }
+
+  override def stringArgs: Iterator[Any] =
+    super.stringArgs.filter(_.isInstanceOf[Expression])
+
private def trimHigherOrder[T](expressions: Seq[T]) = expressions.take(momentOrder + 1)

override val aggBufferAttributes = trimHigherOrder(Seq(n, avg, m2, m3, m4))
@@ -145,7 +153,12 @@ abstract class CentralMomentAgg(child: Expression)
group = "agg_funcs",
since = "1.6.0")
// scalastyle:on line.size.limit
-case class StddevPop(child: Expression) extends CentralMomentAgg(child) {
+case class StddevPop(
+    child: Expression,
+    nullOnDivideByZero: Boolean = !SQLConf.get.legacyStatisticalAggregate)
+  extends CentralMomentAgg(child, nullOnDivideByZero) {
+
+  def this(child: Expression) = this(child, !SQLConf.get.legacyStatisticalAggregate)

override protected def momentOrder = 2

@@ -168,13 +181,18 @@ case class StddevPop(child: Expression) extends CentralMomentAgg(child) {
group = "agg_funcs",
since = "1.6.0")
// scalastyle:on line.size.limit
-case class StddevSamp(child: Expression) extends CentralMomentAgg(child) {
+case class StddevSamp(
+    child: Expression,
+    nullOnDivideByZero: Boolean = !SQLConf.get.legacyStatisticalAggregate)
+  extends CentralMomentAgg(child, nullOnDivideByZero) {
+
+  def this(child: Expression) = this(child, !SQLConf.get.legacyStatisticalAggregate)

override protected def momentOrder = 2

override val evaluateExpression: Expression = {
If(n === 0.0, Literal.create(null, DoubleType),
-      If(n === 1.0, Double.NaN, sqrt(m2 / (n - 1.0))))
+      If(n === 1.0, divideByZeroEvalResult, sqrt(m2 / (n - 1.0))))
}

override def prettyName: String =
@@ -191,7 +209,12 @@ case class StddevSamp(child: Expression) extends CentralMomentAgg(child) {
""",
group = "agg_funcs",
since = "1.6.0")
-case class VariancePop(child: Expression) extends CentralMomentAgg(child) {
+case class VariancePop(
+    child: Expression,
+    nullOnDivideByZero: Boolean = !SQLConf.get.legacyStatisticalAggregate)
+  extends CentralMomentAgg(child, nullOnDivideByZero) {
+
+  def this(child: Expression) = this(child, !SQLConf.get.legacyStatisticalAggregate)

override protected def momentOrder = 2

@@ -212,13 +235,18 @@ case class VariancePop(child: Expression) extends CentralMomentAgg(child) {
""",
group = "agg_funcs",
since = "1.6.0")
-case class VarianceSamp(child: Expression) extends CentralMomentAgg(child) {
+case class VarianceSamp(
+    child: Expression,
+    nullOnDivideByZero: Boolean = !SQLConf.get.legacyStatisticalAggregate)
+  extends CentralMomentAgg(child, nullOnDivideByZero) {
+
+  def this(child: Expression) = this(child, !SQLConf.get.legacyStatisticalAggregate)

override protected def momentOrder = 2

override val evaluateExpression: Expression = {
If(n === 0.0, Literal.create(null, DoubleType),
-      If(n === 1.0, Double.NaN, m2 / (n - 1.0)))
+      If(n === 1.0, divideByZeroEvalResult, m2 / (n - 1.0)))
}

override def prettyName: String = getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("var_samp")
@@ -235,15 +263,20 @@ case class VarianceSamp(child: Expression) extends CentralMomentAgg(child) {
""",
group = "agg_funcs",
since = "1.6.0")
-case class Skewness(child: Expression) extends CentralMomentAgg(child) {
+case class Skewness(
+    child: Expression,
+    nullOnDivideByZero: Boolean = !SQLConf.get.legacyStatisticalAggregate)
+  extends CentralMomentAgg(child, nullOnDivideByZero) {
+
+  def this(child: Expression) = this(child, !SQLConf.get.legacyStatisticalAggregate)

override def prettyName: String = "skewness"

override protected def momentOrder = 3

override val evaluateExpression: Expression = {
If(n === 0.0, Literal.create(null, DoubleType),
-      If(m2 === 0.0, Double.NaN, sqrt(n) * m3 / sqrt(m2 * m2 * m2)))
+      If(m2 === 0.0, divideByZeroEvalResult, sqrt(n) * m3 / sqrt(m2 * m2 * m2)))
}
}

@@ -258,13 +291,18 @@ case class Skewness(child: Expression) extends CentralMomentAgg(child) {
""",
group = "agg_funcs",
since = "1.6.0")
-case class Kurtosis(child: Expression) extends CentralMomentAgg(child) {
+case class Kurtosis(
+    child: Expression,
+    nullOnDivideByZero: Boolean = !SQLConf.get.legacyStatisticalAggregate)
+  extends CentralMomentAgg(child, nullOnDivideByZero) {
+
+  def this(child: Expression) = this(child, !SQLConf.get.legacyStatisticalAggregate)

override protected def momentOrder = 4

override val evaluateExpression: Expression = {
If(n === 0.0, Literal.create(null, DoubleType),
-      If(m2 === 0.0, Double.NaN, n * m4 / (m2 * m2) - 3.0))
+      If(m2 === 0.0, divideByZeroEvalResult, n * m4 / (m2 * m2) - 3.0))
}

override def prettyName: String = "kurtosis"
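Two distinct divide-by-zero triggers are visible in the guards above: `stddev_samp` and `var_samp` divide by `n - 1`, so a single-row group hits the `n === 1.0` branch, while `skewness` and `kurtosis` divide by powers of `m2`, so any group of identical values (`m2 === 0.0`) hits theirs even when `n > 1`. The `stringArgs` override, meanwhile, filters the new boolean out of the rendered arguments, so plan strings continue to show only the child expression. A minimal sketch of both trigger families, assuming a local `SparkSession`:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[1]").getOrCreate()
import spark.implicits._

// n == 1: sample variance/stddev divide by (n - 1) = 0 -> NULL in 3.1.
Seq(42.0).toDF("v").selectExpr("var_samp(v)", "stddev_samp(v)").show()

// m2 == 0 (all values equal): skewness and kurtosis divide by a power
// of m2 -> NULL in 3.1, even though n > 1.
Seq(5.0, 5.0, 5.0).toDF("v").selectExpr("skewness(v)", "kurtosis(v)").show()
```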
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._

/**
@@ -28,7 +29,7 @@ import org.apache.spark.sql.types._
* Definition of Pearson correlation can be found at
* http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient
*/
-abstract class PearsonCorrelation(x: Expression, y: Expression)
+abstract class PearsonCorrelation(x: Expression, y: Expression, nullOnDivideByZero: Boolean)
extends DeclarativeAggregate with ImplicitCastInputTypes {

override def children: Seq[Expression] = Seq(x, y)
@@ -43,6 +44,13 @@ abstract class PearsonCorrelation(x: Expression, y: Expression)
protected val xMk = AttributeReference("xMk", DoubleType, nullable = false)()
protected val yMk = AttributeReference("yMk", DoubleType, nullable = false)()

+  protected def divideByZeroEvalResult: Expression = {
+    if (nullOnDivideByZero) Literal.create(null, DoubleType) else Double.NaN
+  }
+
+  override def stringArgs: Iterator[Any] =
+    super.stringArgs.filter(_.isInstanceOf[Expression])
+
override val aggBufferAttributes: Seq[AttributeReference] = Seq(n, xAvg, yAvg, ck, xMk, yMk)

override val initialValues: Seq[Expression] = Array.fill(6)(Literal(0.0))
@@ -102,12 +110,18 @@ abstract class PearsonCorrelation(x: Expression, y: Expression)
group = "agg_funcs",
since = "1.6.0")
// scalastyle:on line.size.limit
-case class Corr(x: Expression, y: Expression)
-  extends PearsonCorrelation(x, y) {
+case class Corr(
+    x: Expression,
+    y: Expression,
+    nullOnDivideByZero: Boolean = !SQLConf.get.legacyStatisticalAggregate)
+  extends PearsonCorrelation(x, y, nullOnDivideByZero) {
+
+  def this(x: Expression, y: Expression) =
+    this(x, y, !SQLConf.get.legacyStatisticalAggregate)

override val evaluateExpression: Expression = {
If(n === 0.0, Literal.create(null, DoubleType),
-      If(n === 1.0, Double.NaN, ck / sqrt(xMk * yMk)))
+      If(n === 1.0, divideByZeroEvalResult, ck / sqrt(xMk * yMk)))
}

override def prettyName: String = "corr"
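For `corr`, the guarded divide-by-zero is a group containing a single (x, y) pair. A sketch under the same local-`SparkSession` assumption as above:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[1]").getOrCreate()
import spark.implicits._

// One pair: n == 1, so corr returns NULL by default in 3.1
// (NaN with spark.sql.legacy.statisticalAggregate=true).
Seq((1.0, 2.0)).toDF("x", "y").selectExpr("corr(x, y)").show()
```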
@@ -19,13 +19,14 @@ package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._

/**
* Compute the covariance between two expressions.
* When applied on empty data (i.e., count is zero), it returns NULL.
*/
-abstract class Covariance(x: Expression, y: Expression)
+abstract class Covariance(x: Expression, y: Expression, nullOnDivideByZero: Boolean)
extends DeclarativeAggregate with ImplicitCastInputTypes {

override def children: Seq[Expression] = Seq(x, y)
@@ -38,6 +39,13 @@ abstract class Covariance(x: Expression, y: Expression)
protected val yAvg = AttributeReference("yAvg", DoubleType, nullable = false)()
protected val ck = AttributeReference("ck", DoubleType, nullable = false)()

+  protected def divideByZeroEvalResult: Expression = {
+    if (nullOnDivideByZero) Literal.create(null, DoubleType) else Double.NaN
+  }
+
+  override def stringArgs: Iterator[Any] =
+    super.stringArgs.filter(_.isInstanceOf[Expression])
+
override val aggBufferAttributes: Seq[AttributeReference] = Seq(n, xAvg, yAvg, ck)

override val initialValues: Seq[Expression] = Array.fill(4)(Literal(0.0))
@@ -88,7 +96,15 @@ abstract class Covariance(x: Expression, y: Expression)
""",
group = "agg_funcs",
since = "2.0.0")
-case class CovPopulation(left: Expression, right: Expression) extends Covariance(left, right) {
+case class CovPopulation(
+    left: Expression,
+    right: Expression,
+    nullOnDivideByZero: Boolean = !SQLConf.get.legacyStatisticalAggregate)
+  extends Covariance(left, right, nullOnDivideByZero) {
+
+  def this(left: Expression, right: Expression) =
+    this(left, right, !SQLConf.get.legacyStatisticalAggregate)

override val evaluateExpression: Expression = {
If(n === 0.0, Literal.create(null, DoubleType), ck / n)
}
@@ -105,10 +121,18 @@ case class CovPopulation(left: Expression, right: Expression) extends Covariance
""",
group = "agg_funcs",
since = "2.0.0")
-case class CovSample(left: Expression, right: Expression) extends Covariance(left, right) {
+case class CovSample(
+    left: Expression,
+    right: Expression,
+    nullOnDivideByZero: Boolean = !SQLConf.get.legacyStatisticalAggregate)
+  extends Covariance(left, right, nullOnDivideByZero) {
+
+  def this(left: Expression, right: Expression) =
+    this(left, right, !SQLConf.get.legacyStatisticalAggregate)

override val evaluateExpression: Expression = {
If(n === 0.0, Literal.create(null, DoubleType),
-      If(n === 1.0, Double.NaN, ck / (n - 1.0)))
+      If(n === 1.0, divideByZeroEvalResult, ck / (n - 1.0)))
}
override def prettyName: String = "covar_samp"
}
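Note the asymmetry between the two covariance flavors above: `covar_pop` divides by `n` and is therefore guarded only against empty input, while `covar_samp` divides by `n - 1` and routes a single pair to `divideByZeroEvalResult`. A sketch, again assuming a local `SparkSession`:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[1]").getOrCreate()
import spark.implicits._

// covar_pop divides by n and is well-defined for one pair (it is 0.0);
// covar_samp divides by (n - 1) and returns NULL in 3.1 (NaN before).
Seq((1.0, 2.0)).toDF("x", "y")
  .selectExpr("covar_pop(x, y)", "covar_samp(x, y)")
  .show()
```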
@@ -2342,6 +2342,16 @@ object SQLConf {
.booleanConf
.createWithDefault(false)

+  val LEGACY_STATISTICAL_AGGREGATE =
+    buildConf("spark.sql.legacy.statisticalAggregate")
+      .internal()
+      .doc("When set to true, statistical aggregate functions return Double.NaN " +
+        "when a divide-by-zero occurs during expression evaluation; otherwise, they return NULL. " +
+        "Before version 3.1.0, NaN was returned by default in the divide-by-zero case.")
+      .version("3.1.0")
+      .booleanConf
+      .createWithDefault(false)
+
val TRUNCATE_TABLE_IGNORE_PERMISSION_ACL =
buildConf("spark.sql.truncateTable.ignorePermissionAcl.enabled")
.internal()
@@ -3364,6 +3374,8 @@ class SQLConf extends Serializable with Logging {
def allowNegativeScaleOfDecimalEnabled: Boolean =
getConf(SQLConf.LEGACY_ALLOW_NEGATIVE_SCALE_OF_DECIMAL_ENABLED)

+  def legacyStatisticalAggregate: Boolean = getConf(SQLConf.LEGACY_STATISTICAL_AGGREGATE)
+
def truncateTableIgnorePermissionAcl: Boolean =
getConf(SQLConf.TRUNCATE_TABLE_IGNORE_PERMISSION_ACL)

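The conf is registered as `.internal()`, so it does not appear in public documentation, but it can be set like any other SQL conf; the aggregate constructors read it at analysis time through their default `nullOnDivideByZero` argument. A minimal sketch, assuming a local `SparkSession`:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

val spark = SparkSession.builder().master("local[1]").getOrCreate()

// Opt back into the pre-3.1 NaN behavior for this session.
spark.conf.set("spark.sql.legacy.statisticalAggregate", "true")

// The getter added in this diff now reflects the session setting.
assert(SQLConf.get.legacyStatisticalAggregate)
```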
@@ -143,15 +143,15 @@ SELECT var_pop(1.0), var_samp(2.0)
-- !query schema
struct<var_pop(CAST(1.0 AS DOUBLE)):double,var_samp(CAST(2.0 AS DOUBLE)):double>
-- !query output
-0.0 NaN
+0.0 NULL


-- !query
SELECT stddev_pop(CAST(3.0 AS Decimal(38,0))), stddev_samp(CAST(4.0 AS Decimal(38,0)))
-- !query schema
struct<stddev_pop(CAST(CAST(3.0 AS DECIMAL(38,0)) AS DOUBLE)):double,stddev_samp(CAST(CAST(4.0 AS DECIMAL(38,0)) AS DOUBLE)):double>
-- !query output
-0.0 NaN
+0.0 NULL


-- !query