10 changes: 9 additions & 1 deletion pom.xml
@@ -18,7 +18,7 @@
<artifact.scala.version>${scala.major.version}</artifact.scala.version>
<scala-maven-plugin.version>4.4.0</scala-maven-plugin.version>

<spark.version>3.2.0</spark.version>
<spark.version>3.2.1</spark.version>
</properties>

<name>deequ</name>
@@ -74,6 +74,14 @@
<version>${scala.version}</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.scala-lang/scala-reflect -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-reflect</artifactId>
<version>${scala.version}</version>
</dependency>
Comment on lines +77 to +82
Contributor Author

This is to fix the following error that happened on my local PC:

*** RUN ABORTED *** (1 minute, 15 seconds)
  java.lang.VerifyError: class scala.tools.nsc.reporters.Reporter overrides final method echo.(Ljava/lang/String;)V
  at java.lang.ClassLoader.defineClass1(Native Method)
  at java.lang.ClassLoader.defineClass(ClassLoader.java:756)
  at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
  at java.net.URLClassLoader.defineClass(URLClassLoader.java:468)
  at java.net.URLClassLoader.access$100(URLClassLoader.java:74)
  at java.net.URLClassLoader$1.run(URLClassLoader.java:369)
  at java.net.URLClassLoader$1.run(URLClassLoader.java:363)
  at java.security.AccessController.doPrivileged(Native Method)
  at java.net.URLClassLoader.findClass(URLClassLoader.java:362)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
  at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:352)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
  at com.amazon.deequ.suggestions.ConstraintSuggestionRunnerTest$.verificationFnFromConstraintSrc(ConstraintSuggestionRunnerTest.scala:291)
  at com.amazon.deequ.suggestions.ConstraintSuggestionRunnerTest.suggestHasDataTypeConstraintVerifyTest(ConstraintSuggestionRunnerTest.scala:259)
  at com.amazon.deequ.suggestions.ConstraintSuggestionRunnerTest.$anonfun$new$22(ConstraintSuggestionRunnerTest.scala:221)
  at com.amazon.deequ.suggestions.ConstraintSuggestionRunnerTest.$anonfun$new$22$adapted(ConstraintSuggestionRunnerTest.scala:215)
  at com.amazon.deequ.SparkContextSpec.withSparkSession(SparkContextSpec.scala:33)
  at com.amazon.deequ.SparkContextSpec.withSparkSession$(SparkContextSpec.scala:30)
  at com.amazon.deequ.suggestions.ConstraintSuggestionRunnerTest.withSparkSession(ConstraintSuggestionRunnerTest.scala:36)
  at com.amazon.deequ.suggestions.ConstraintSuggestionRunnerTest.$anonfun$new$21(ConstraintSuggestionRunnerTest.scala:215)
  at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
  at org.scalatest.OutcomeOf.outcomeOf(OutcomeOf.scala:85)
  at org.scalatest.OutcomeOf.outcomeOf$(OutcomeOf.scala:83)
  at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
  at org.scalatest.Transformer.apply(Transformer.scala:22)
  at org.scalatest.Transformer.apply(Transformer.scala:20)
  at org.scalatest.wordspec.AnyWordSpecLike$$anon$3.apply(AnyWordSpecLike.scala:1076)
  at org.scalatest.TestSuite.withFixture(TestSuite.scala:196)
  at org.scalatest.TestSuite.withFixture$(TestSuite.scala:195)
  at org.scalatest.wordspec.AnyWordSpec.withFixture(AnyWordSpec.scala:1879)
  at org.scalatest.wordspec.AnyWordSpecLike.invokeWithFixture$1(AnyWordSpecLike.scala:1074)
  at org.scalatest.wordspec.AnyWordSpecLike.$anonfun$runTest$1(AnyWordSpecLike.scala:1086)
  at org.scalatest.SuperEngine.runTestImpl(Engine.scala:306)
  at org.scalatest.wordspec.AnyWordSpecLike.runTest(AnyWordSpecLike.scala:1086)
  at org.scalatest.wordspec.AnyWordSpecLike.runTest$(AnyWordSpecLike.scala:1068)
  at org.scalatest.wordspec.AnyWordSpec.runTest(AnyWordSpec.scala:1879)
  at org.scalatest.wordspec.AnyWordSpecLike.$anonfun$runTests$1(AnyWordSpecLike.scala:1145)
  at org.scalatest.SuperEngine.$anonfun$runTestsInBranch$1(Engine.scala:413)
  at scala.collection.immutable.List.foreach(List.scala:392)
  at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401)
  at org.scalatest.SuperEngine.runTestsInBranch(Engine.scala:390)
  at org.scalatest.SuperEngine.$anonfun$runTestsInBranch$1(Engine.scala:427)
  at scala.collection.immutable.List.foreach(List.scala:392)
  at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401)
  at org.scalatest.SuperEngine.runTestsInBranch(Engine.scala:396)
  at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:475)
  at org.scalatest.wordspec.AnyWordSpecLike.runTests(AnyWordSpecLike.scala:1145)
  at org.scalatest.wordspec.AnyWordSpecLike.runTests$(AnyWordSpecLike.scala:1144)
  at org.scalatest.wordspec.AnyWordSpec.runTests(AnyWordSpec.scala:1879)
  at org.scalatest.Suite.run(Suite.scala:1112)
  at org.scalatest.Suite.run$(Suite.scala:1094)
  at org.scalatest.wordspec.AnyWordSpec.org$scalatest$wordspec$AnyWordSpecLike$$super$run(AnyWordSpec.scala:1879)
  at org.scalatest.wordspec.AnyWordSpecLike.$anonfun$run$1(AnyWordSpecLike.scala:1190)
  at org.scalatest.SuperEngine.runImpl(Engine.scala:535)
  at org.scalatest.wordspec.AnyWordSpecLike.run(AnyWordSpecLike.scala:1190)
  at org.scalatest.wordspec.AnyWordSpecLike.run$(AnyWordSpecLike.scala:1188)
  at org.scalatest.wordspec.AnyWordSpec.run(AnyWordSpec.scala:1879)
  at org.scalatest.Suite.callExecuteOnSuite$1(Suite.scala:1175)
  at org.scalatest.Suite.$anonfun$runNestedSuites$1(Suite.scala:1222)
  at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
  at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
  at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
  at org.scalatest.Suite.runNestedSuites(Suite.scala:1220)
  at org.scalatest.Suite.runNestedSuites$(Suite.scala:1154)
  at org.scalatest.tools.DiscoverySuite.runNestedSuites(DiscoverySuite.scala:30)
  at org.scalatest.Suite.run(Suite.scala:1109)
  at org.scalatest.Suite.run$(Suite.scala:1094)
  at org.scalatest.tools.DiscoverySuite.run(DiscoverySuite.scala:30)
  at org.scalatest.tools.SuiteRunner.run(SuiteRunner.scala:45)
  at org.scalatest.tools.Runner$.$anonfun$doRunRunRunDaDoRunRun$13(Runner.scala:1320)
  at org.scalatest.tools.Runner$.$anonfun$doRunRunRunDaDoRunRun$13$adapted(Runner.scala:1314)
  at scala.collection.immutable.List.foreach(List.scala:392)
  at org.scalatest.tools.Runner$.doRunRunRunDaDoRunRun(Runner.scala:1314)
  at org.scalatest.tools.Runner$.$anonfun$runOptionallyWithPassFailReporter$24(Runner.scala:993)
  at org.scalatest.tools.Runner$.$anonfun$runOptionallyWithPassFailReporter$24$adapted(Runner.scala:971)
  at org.scalatest.tools.Runner$.withClassLoaderAndDispatchReporter(Runner.scala:1480)
  at org.scalatest.tools.Runner$.runOptionallyWithPassFailReporter(Runner.scala:971)
  at org.scalatest.tools.Runner$.main(Runner.scala:775)
  at org.scalatest.tools.Runner.main(Runner.scala)
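
For context, the aborted test compiles constraint source code at runtime, which loads compiler and reflection classes together; if scala-compiler and scala-reflect resolve to different Scala versions, class loading can fail with exactly this kind of VerifyError. Pinning scala-reflect to ${scala.version} keeps the two in sync. A minimal sketch of the runtime-compilation pattern involved (illustrative only, using the standard ToolBox API rather than deequ's exact test code):

import scala.reflect.runtime.{universe => ru}
import scala.tools.reflect.ToolBox

object ToolBoxSketch {
  def main(args: Array[String]): Unit = {
    // mkToolBox is provided by scala-compiler; the mirror comes from scala-reflect.
    // Mismatched versions of the two artifacts surface as a VerifyError at class load.
    val toolBox = ru.runtimeMirror(getClass.getClassLoader).mkToolBox()
    val inc = toolBox.eval(toolBox.parse("(x: Int) => x + 1")).asInstanceOf[Int => Int]
    println(inc(41)) // prints 42
  }
}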



<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.major.version}</artifactId>
@@ -1,19 +1,18 @@
/**
* Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/

* Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
Contributor Author

@tanvn tanvn Feb 11, 2022


Formatted using scalafmt to pass the scalastyle check.

*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.InternalRow
@@ -26,12 +25,13 @@ import org.apache.spark.sql.types._

/** Adjusted version of org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile
* (github tag v2.2.0) */
private[sql] case class StatefulApproxQuantile(
child: Expression,
accuracyExpression: Expression,
override val mutableAggBufferOffset: Int,
override val inputAggBufferOffset: Int)
extends TypedImperativeAggregate[PercentileDigest] with ImplicitCastInputTypes with BinaryLike[Expression] {
private[sql] case class StatefulApproxQuantile(child: Expression,
accuracyExpression: Expression,
override val mutableAggBufferOffset: Int,
override val inputAggBufferOffset: Int)
extends TypedImperativeAggregate[PercentileDigest]
with ImplicitCastInputTypes
with BinaryLike[Expression] {

def this(child: Expression, accuracyExpression: Expression) = {
this(child, accuracyExpression, 0, 0)
@@ -111,11 +111,12 @@ private[sql] case class StatefulApproxQuantile(
}

override def left: Expression = child

override def right: Expression = accuracyExpression
// override def third: Expression = accuracyExpression

protected def withNewChildrenInternal(
newFirst: Expression, newSecond: Expression): StatefulApproxQuantile =
protected def withNewChildrenInternal(newFirst: Expression,
newSecond: Expression): StatefulApproxQuantile =
copy(child = newFirst, accuracyExpression = newSecond)

// protected def withNewChildrenInternal(
@@ -50,4 +50,8 @@ private[sql] class StatefulCorrelation(
val state = Seq(super.hashCode(), evaluateExpression)
state.map { _.hashCode() }.foldLeft(0) {(a, b) => 31 * a + b }
}

override protected def withNewChildrenInternal(newLeft: Expression,
newRight: Expression): StatefulCorrelation =
new StatefulCorrelation(newLeft, newRight, nullOnDivideByZero)
Comment on lines +54 to +56
Contributor Author

@tanvn tanvn Feb 11, 2022


This is to fix a failing test on baseCheck.hasCorrelation:
https://github.com/awslabs/deequ/blob/master/src/test/scala/com/amazon/deequ/checks/CheckTest.scala#L577

If we do not provide this withNewChildrenInternal override, the implementation inherited from Corr is used, which leads to incorrect results.
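
In Spark 3.2, tree transformations rebuild expressions through withNewChildrenInternal, so without the override a rewrite silently replaces StatefulCorrelation with a plain Corr. A toy sketch of the pattern (hypothetical classes, not deequ code):

// Base stands in for Corr, Stateful for StatefulCorrelation.
class Base(val left: Int, val right: Int) {
  def withNewChildren(l: Int, r: Int): Base = new Base(l, r) // rebuilds as Base, losing the subclass
}

class Stateful(l: Int, r: Int) extends Base(l, r) {
  // Overriding keeps the node's type (and its extra behavior) across rewrites.
  override def withNewChildren(l: Int, r: Int): Stateful = new Stateful(l, r)
}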

}
1 change: 1 addition & 0 deletions src/test/scala/com/amazon/deequ/SparkContextSpec.scala
@@ -78,6 +78,7 @@ trait SparkContextSpec {
.appName("test")
.config("spark.ui.enabled", "false")
.config("spark.sql.shuffle.partitions", 2.toString)
.config("spark.sql.adaptive.enabled", value = false)
Contributor Author


This is to fix failing tests that verify the submitted job count from SparkSessionStats.
Because adaptive query execution (AQE) is enabled by default from Spark 3.2, the plans Spark creates to execute tasks have changed significantly.
Here we disable adaptive query execution to make Spark behave the same as 3.1.
This fixes failing tests like the one below:
https://github.com/awslabs/deequ/blob/master/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerRunnerTest.scala#L63
(There were several failed tests like this one; the actual number of submitted jobs is 5 when AQE is enabled and 3 when AQE is disabled.)
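
For reference, counting submitted jobs can be done with a SparkListener; a minimal sketch of the idea (a hypothetical helper, not SparkSessionStats itself):

import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart}
import org.apache.spark.sql.SparkSession

// Counts every job submitted to the SparkContext it is registered on.
class JobCountingListener extends SparkListener {
  @volatile var jobCount: Int = 0
  override def onJobStart(jobStart: SparkListenerJobStart): Unit = jobCount += 1
}

object JobCountSketch {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .master("local")
      .config("spark.sql.adaptive.enabled", value = false) // the toggle added in this PR
      .getOrCreate()
    val listener = new JobCountingListener
    session.sparkContext.addSparkListener(listener)
    session.range(100).groupBy().count().collect()
    println(s"jobs submitted: ${listener.jobCount}")
  }
}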

Contributor Author


The plan as explained on the master branch (Spark 3.1):

== Parsed Logical Plan ==
'Aggregate [sum(cast(isnotnull('item) as int)) AS sum(CAST((item IS NOT NULL) AS INT))#915, count(1) AS count(1)#916L, stateful_approx_count_distinct('item, 0, 0) AS stateful_approx_count_distinct(item)#1021, statefuldatatype('item, org.apache.spark.sql.StatefulDataType@4ba1c1a2, 0, 0) AS statefuldatatype(item)#1032, sum(cast(isnotnull('att1) as int)) AS sum(CAST((att1 IS NOT NULL) AS INT))#1033, count(1) AS count(1)#1034L, stateful_approx_count_distinct('att1, 0, 0) AS stateful_approx_count_distinct(att1)#1139, sum(cast(isnotnull('att2) as int)) AS sum(CAST((att2 IS NOT NULL) AS INT))#1140, count(1) AS count(1)#1141L, stateful_approx_count_distinct('att2, 0, 0) AS stateful_approx_count_distinct(att2)#1246, sum(cast(isnotnull('att3) as int)) AS sum(CAST((att3 IS NOT NULL) AS INT))#1247, count(1) AS count(1)#1248L, stateful_approx_count_distinct('att3, 0, 0) AS stateful_approx_count_distinct(att3)#1353, count(1) AS count(1)#1354L]
+- Project [_1#4 AS item#13, _2#5 AS att1#14, _3#6 AS att2#15, _4#7 AS att3#16]
   +- LocalRelation [_1#4, _2#5, _3#6, _4#7]

== Analyzed Logical Plan ==
sum(CAST((item IS NOT NULL) AS INT)): bigint, count(1): bigint, stateful_approx_count_distinct(item): binary, statefuldatatype(item): binary, sum(CAST((att1 IS NOT NULL) AS INT)): bigint, count(1): bigint, stateful_approx_count_distinct(att1): binary, sum(CAST((att2 IS NOT NULL) AS INT)): bigint, count(1): bigint, stateful_approx_count_distinct(att2): binary, sum(CAST((att3 IS NOT NULL) AS INT)): bigint, count(1): bigint, stateful_approx_count_distinct(att3): binary, count(1): bigint
Aggregate [sum(cast(cast(isnotnull(item#13) as int) as bigint)) AS sum(CAST((item IS NOT NULL) AS INT))#915L, count(1) AS count(1)#916L, stateful_approx_count_distinct(item#13, 0, 0) AS stateful_approx_count_distinct(item)#1021, statefuldatatype(item#13, org.apache.spark.sql.StatefulDataType@4ba1c1a2, 0, 0) AS statefuldatatype(item)#1032, sum(cast(cast(isnotnull(att1#14) as int) as bigint)) AS sum(CAST((att1 IS NOT NULL) AS INT))#1033L, count(1) AS count(1)#1034L, stateful_approx_count_distinct(att1#14, 0, 0) AS stateful_approx_count_distinct(att1)#1139, sum(cast(cast(isnotnull(att2#15) as int) as bigint)) AS sum(CAST((att2 IS NOT NULL) AS INT))#1140L, count(1) AS count(1)#1141L, stateful_approx_count_distinct(att2#15, 0, 0) AS stateful_approx_count_distinct(att2)#1246, sum(cast(cast(isnotnull(att3#16) as int) as bigint)) AS sum(CAST((att3 IS NOT NULL) AS INT))#1247L, count(1) AS count(1)#1248L, stateful_approx_count_distinct(att3#16, 0, 0) AS stateful_approx_count_distinct(att3)#1353, count(1) AS count(1)#1354L]
+- Project [_1#4 AS item#13, _2#5 AS att1#14, _3#6 AS att2#15, _4#7 AS att3#16]
   +- LocalRelation [_1#4, _2#5, _3#6, _4#7]

== Optimized Logical Plan ==
Aggregate [sum(cast(cast(isnotnull(item#13) as int) as bigint)) AS sum(CAST((item IS NOT NULL) AS INT))#915L, count(1) AS count(1)#916L, stateful_approx_count_distinct(item#13, 0, 0) AS stateful_approx_count_distinct(item)#1021, statefuldatatype(item#13, org.apache.spark.sql.StatefulDataType@4ba1c1a2, 0, 0) AS statefuldatatype(item)#1032, sum(1) AS sum(CAST((att1 IS NOT NULL) AS INT))#1033L, count(1) AS count(1)#1034L, stateful_approx_count_distinct(att1#14, 0, 0) AS stateful_approx_count_distinct(att1)#1139, sum(1) AS sum(CAST((att2 IS NOT NULL) AS INT))#1140L, count(1) AS count(1)#1141L, stateful_approx_count_distinct(att2#15, 0, 0) AS stateful_approx_count_distinct(att2)#1246, sum(1) AS sum(CAST((att3 IS NOT NULL) AS INT))#1247L, count(1) AS count(1)#1248L, stateful_approx_count_distinct(att3#16, 0, 0) AS stateful_approx_count_distinct(att3)#1353, count(1) AS count(1)#1354L]
+- LocalRelation [item#13, att1#14, att2#15, att3#16]

== Physical Plan ==
HashAggregate(keys=[], functions=[sum(cast(cast(isnotnull(item#13) as int) as bigint)), count(1), stateful_approx_count_distinct(item#13, 0, 0), statefuldatatype(item#13, org.apache.spark.sql.StatefulDataType@4ba1c1a2, 0, 0), sum(1), stateful_approx_count_distinct(att1#14, 0, 0), stateful_approx_count_distinct(att2#15, 0, 0), stateful_approx_count_distinct(att3#16, 0, 0)], output=[sum(CAST((item IS NOT NULL) AS INT))#915L, count(1)#916L, stateful_approx_count_distinct(item)#1021, statefuldatatype(item)#1032, sum(CAST((att1 IS NOT NULL) AS INT))#1033L, count(1)#1034L, stateful_approx_count_distinct(att1)#1139, sum(CAST((att2 IS NOT NULL) AS INT))#1140L, count(1)#1141L, stateful_approx_count_distinct(att2)#1246, sum(CAST((att3 IS NOT NULL) AS INT))#1247L, count(1)#1248L, stateful_approx_count_distinct(att3)#1353, count(1)#1354L])
+- Exchange SinglePartition, ENSURE_REQUIREMENTS, [id=#10]
   +- HashAggregate(keys=[], functions=[partial_sum(cast(cast(isnotnull(item#13) as int) as bigint)), partial_count(1), partial_stateful_approx_count_distinct(item#13, 0, 0), partial_statefuldatatype(item#13, org.apache.spark.sql.StatefulDataType@4ba1c1a2, 0, 0), partial_sum(1), partial_stateful_approx_count_distinct(att1#14, 0, 0), partial_stateful_approx_count_distinct(att2#15, 0, 0), partial_stateful_approx_count_distinct(att3#16, 0, 0)], output=[sum#2224L, count#2225L, MS[0]#1407L, MS[1]#1408L, MS[2]#1409L, MS[3]#1410L, MS[4]#1411L, MS[5]#1412L, MS[6]#1413L, MS[7]#1414L, MS[8]#1415L, MS[9]#1416L, MS[10]#1417L, MS[11]#1418L, MS[12]#1419L, MS[13]#1420L, MS[14]#1421L, MS[15]#1422L, MS[16]#1423L, MS[17]#1424L, MS[18]#1425L, MS[19]#1426L, MS[20]#1427L, MS[21]#1428L, ... 192 more fields])
      +- LocalTableScan [item#13, att1#14, att2#15, att3#16]

The plan as explained on this branch (Spark 3.2.1):

== Parsed Logical Plan ==
'Aggregate [sum(cast(isnotnull('item) as int)) AS sum(CAST((item IS NOT NULL) AS INT))#915, count(1) AS count(1)#916L, stateful_approx_count_distinct('item, 0, 0) AS stateful_approx_count_distinct(item)#1021, statefuldatatype('item, org.apache.spark.sql.StatefulDataType@253b1cbd, 0, 0, None) AS statefuldatatype(item)#1032, sum(cast(isnotnull('att1) as int)) AS sum(CAST((att1 IS NOT NULL) AS INT))#1033, count(1) AS count(1)#1034L, stateful_approx_count_distinct('att1, 0, 0) AS stateful_approx_count_distinct(att1)#1139, sum(cast(isnotnull('att2) as int)) AS sum(CAST((att2 IS NOT NULL) AS INT))#1140, count(1) AS count(1)#1141L, stateful_approx_count_distinct('att2, 0, 0) AS stateful_approx_count_distinct(att2)#1246, sum(cast(isnotnull('att3) as int)) AS sum(CAST((att3 IS NOT NULL) AS INT))#1247, count(1) AS count(1)#1248L, stateful_approx_count_distinct('att3, 0, 0) AS stateful_approx_count_distinct(att3)#1353, count(1) AS count(1)#1354L]
+- Project [_1#4 AS item#13, _2#5 AS att1#14, _3#6 AS att2#15, _4#7 AS att3#16]
   +- LocalRelation [_1#4, _2#5, _3#6, _4#7]

== Analyzed Logical Plan ==
sum(CAST((item IS NOT NULL) AS INT)): bigint, count(1): bigint, stateful_approx_count_distinct(item): binary, statefuldatatype(item): binary, sum(CAST((att1 IS NOT NULL) AS INT)): bigint, count(1): bigint, stateful_approx_count_distinct(att1): binary, sum(CAST((att2 IS NOT NULL) AS INT)): bigint, count(1): bigint, stateful_approx_count_distinct(att2): binary, sum(CAST((att3 IS NOT NULL) AS INT)): bigint, count(1): bigint, stateful_approx_count_distinct(att3): binary, count(1): bigint
Aggregate [sum(cast(isnotnull(item#13) as int)) AS sum(CAST((item IS NOT NULL) AS INT))#915L, count(1) AS count(1)#916L, stateful_approx_count_distinct(item#13, 0, 0) AS stateful_approx_count_distinct(item)#1021, statefuldatatype(item#13, org.apache.spark.sql.StatefulDataType@253b1cbd, 0, 0, None) AS statefuldatatype(item)#1032, sum(cast(isnotnull(att1#14) as int)) AS sum(CAST((att1 IS NOT NULL) AS INT))#1033L, count(1) AS count(1)#1034L, stateful_approx_count_distinct(att1#14, 0, 0) AS stateful_approx_count_distinct(att1)#1139, sum(cast(isnotnull(att2#15) as int)) AS sum(CAST((att2 IS NOT NULL) AS INT))#1140L, count(1) AS count(1)#1141L, stateful_approx_count_distinct(att2#15, 0, 0) AS stateful_approx_count_distinct(att2)#1246, sum(cast(isnotnull(att3#16) as int)) AS sum(CAST((att3 IS NOT NULL) AS INT))#1247L, count(1) AS count(1)#1248L, stateful_approx_count_distinct(att3#16, 0, 0) AS stateful_approx_count_distinct(att3)#1353, count(1) AS count(1)#1354L]
+- Project [_1#4 AS item#13, _2#5 AS att1#14, _3#6 AS att2#15, _4#7 AS att3#16]
   +- LocalRelation [_1#4, _2#5, _3#6, _4#7]

== Optimized Logical Plan ==
Aggregate [sum(cast(isnotnull(item#13) as int)) AS sum(CAST((item IS NOT NULL) AS INT))#915L, count(1) AS count(1)#916L, stateful_approx_count_distinct(item#13, 0, 0) AS stateful_approx_count_distinct(item)#1021, statefuldatatype(item#13, org.apache.spark.sql.StatefulDataType@253b1cbd, 0, 0, None) AS statefuldatatype(item)#1032, sum(1) AS sum(CAST((att1 IS NOT NULL) AS INT))#1033L, count(1) AS count(1)#1034L, stateful_approx_count_distinct(att1#14, 0, 0) AS stateful_approx_count_distinct(att1)#1139, sum(1) AS sum(CAST((att2 IS NOT NULL) AS INT))#1140L, count(1) AS count(1)#1141L, stateful_approx_count_distinct(att2#15, 0, 0) AS stateful_approx_count_distinct(att2)#1246, sum(1) AS sum(CAST((att3 IS NOT NULL) AS INT))#1247L, count(1) AS count(1)#1248L, stateful_approx_count_distinct(att3#16, 0, 0) AS stateful_approx_count_distinct(att3)#1353, count(1) AS count(1)#1354L]
+- LocalRelation [item#13, att1#14, att2#15, att3#16]

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[], functions=[sum(cast(isnotnull(item#13) as int)), count(1), stateful_approx_count_distinct(item#13, 0, 0), statefuldatatype(item#13, org.apache.spark.sql.StatefulDataType@253b1cbd, 0, 0, None), sum(1), stateful_approx_count_distinct(att1#14, 0, 0), stateful_approx_count_distinct(att2#15, 0, 0), stateful_approx_count_distinct(att3#16, 0, 0)], output=[sum(CAST((item IS NOT NULL) AS INT))#915L, count(1)#916L, stateful_approx_count_distinct(item)#1021, statefuldatatype(item)#1032, sum(CAST((att1 IS NOT NULL) AS INT))#1033L, count(1)#1034L, stateful_approx_count_distinct(att1)#1139, sum(CAST((att2 IS NOT NULL) AS INT))#1140L, count(1)#1141L, stateful_approx_count_distinct(att2)#1246, sum(CAST((att3 IS NOT NULL) AS INT))#1247L, count(1)#1248L, stateful_approx_count_distinct(att3)#1353, count(1)#1354L])
   +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [id=#11]
      +- HashAggregate(keys=[], functions=[partial_sum(cast(isnotnull(item#13) as int)), partial_count(1), partial_stateful_approx_count_distinct(item#13, 0, 0), partial_statefuldatatype(item#13, org.apache.spark.sql.StatefulDataType@253b1cbd, 0, 0, None), partial_sum(1), partial_stateful_approx_count_distinct(att1#14, 0, 0), partial_stateful_approx_count_distinct(att2#15, 0, 0), partial_stateful_approx_count_distinct(att3#16, 0, 0)], output=[sum#2224L, count#2225L, MS[0]#1407L, MS[1]#1408L, MS[2]#1409L, MS[3]#1410L, MS[4]#1411L, MS[5]#1412L, MS[6]#1413L, MS[7]#1414L, MS[8]#1415L, MS[9]#1416L, MS[10]#1417L, MS[11]#1418L, MS[12]#1419L, MS[13]#1420L, MS[14]#1421L, MS[15]#1422L, MS[16]#1423L, MS[17]#1424L, MS[18]#1425L, MS[19]#1426L, MS[20]#1427L, MS[21]#1428L, ... 192 more fields])
         +- LocalTableScan [item#13, att1#14, att2#15, att3#16]
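
The AdaptiveSparkPlan wrapper in the 3.2.1 physical plan above is the visible difference; the flag can be toggled and the plan inspected directly. A quick sketch (illustrative only):

import org.apache.spark.sql.SparkSession

object AqePlanSketch {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().master("local").getOrCreate()

    // With AQE on (the 3.2 default), the executed plan is wrapped in AdaptiveSparkPlan.
    session.conf.set("spark.sql.adaptive.enabled", "true")
    println(session.range(10).groupBy().count().queryExecution.executedPlan)

    // With AQE off, the plan is a plain HashAggregate/Exchange tree, as on Spark 3.1.
    session.conf.set("spark.sql.adaptive.enabled", "false")
    println(session.range(10).groupBy().count().queryExecution.executedPlan)
  }
}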

.getOrCreate()
session.sparkContext.setCheckpointDir(System.getProperty("java.io.tmpdir"))
session