apache · huaxingao · Dec 4, 2022 · Dec 5, 2022 · Dec 5, 2022 · Dec 5, 2022
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
@@ -36,6 +36,11 @@ object MimaExcludes {
 
   // Exclude rules for 3.4.x from 3.3.0
   lazy val v34excludes = defaultExcludes ++ Seq(
+    // [SPARK-41378] Support Column Stats in DS v2
+    ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.connector.read.Statistics"),
+    ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.connector.read.SupportsReportStatistics.estimateStatistics"),
+    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.connector.read.SupportsReportStatistics.estimateStatistics"),
+
     ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.recommendation.ALS.checkedCast"),
     ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.recommendation.ALSModel.checkedCast"),
 

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportStatistics.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportStatistics.java
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.connector.read;
 
 import org.apache.spark.annotation.Evolving;
+import org.apache.spark.sql.connector.read.stats.Statistics;
 
 /**
  * A mix in interface for {@link Scan}. Data sources can implement this interface to

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/stats/ColumnStatistics.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/stats/ColumnStatistics.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.read.stats;
+
+import org.apache.spark.annotation.Evolving;
+import java.math.BigInteger;
+import java.util.Optional;
+import java.util.OptionalLong;
+
+/**
+ * An interface to represent column statistics, which is part of
+ * {@link Statistics}.
+ * <p>
+ * Every accessor has a default implementation that returns an empty optional, so an
+ * implementation only needs to override the statistics it can actually provide.
+ *
+ * @since 3.4.0
+ */
+@Evolving
+public interface ColumnStatistics {
+  /**
+   * Returns the distinct-value count of the column, or empty (the default) when it is not
+   * reported.
+   */
+  default Optional<BigInteger> distinctCount() {
+    return Optional.empty();
+  }
+
+  /**
+   * Returns the minimum value of the column, or empty (the default) when it is not reported.
+   * The value is untyped; NOTE(review): presumably its runtime type should correspond to the
+   * column's data type -- confirm against the consumer.
+   */
+  default Optional<Object> min() {
+    return Optional.empty();
+  }
+
+  /**
+   * Returns the maximum value of the column, or empty (the default) when it is not reported.
+   * The value is untyped; NOTE(review): presumably its runtime type should correspond to the
+   * column's data type -- confirm against the consumer.
+   */
+  default Optional<Object> max() {
+    return Optional.empty();
+  }
+
+  /**
+   * Returns the null count of the column, or empty (the default) when it is not reported.
+   */
+  default Optional<BigInteger> nullCount() {
+    return Optional.empty();
+  }
+
+  /**
+   * Returns the average length of the column's values, or empty (the default) when it is not
+   * reported. NOTE(review): the unit is not stated here (presumably bytes) -- confirm.
+   */
+  default OptionalLong avgLen() {
+    return OptionalLong.empty();
+  }
+
+  /**
+   * Returns the maximum length of the column's values, or empty (the default) when it is not
+   * reported. NOTE(review): the unit is not stated here (presumably bytes) -- confirm.
+   */
+  default OptionalLong maxLen() {
+    return OptionalLong.empty();
+  }
+
+  /**
+   * Returns the {@link Histogram} of the column, or empty (the default) when it is not
+   * reported.
+   */
+  default Optional<Histogram> histogram() {
+    return Optional.empty();
+  }
+}
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/stats/Histogram.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/stats/Histogram.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.read.stats;
+
+import org.apache.spark.annotation.Evolving;
+
+/**
+ * An interface to represent an equi-height histogram, which is a part of
+ * {@link ColumnStatistics}. An equi-height histogram represents the distribution of
+ * a column's values by a sequence of bins.
+ *
+ * @since 3.4.0
+ */
+@Evolving
+public interface Histogram {
+  /**
+   * Returns the height of this equi-height histogram.
+   * NOTE(review): presumably the number of rows that fall into each bin -- confirm.
+   */
+  double height();
+
+  /** Returns the bins that make up this histogram. */
+  HistogramBin[] bins();
+}
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/stats/HistogramBin.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/stats/HistogramBin.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.read.stats;
+
+import org.apache.spark.annotation.Evolving;
+
+/**
+ * An interface to represent a bin in an equi-height histogram.
+ *
+ * @since 3.4.0
+ */
+@Evolving
+public interface HistogramBin {
+  /**
+   * Returns the low end of this bin.
+   * NOTE(review): whether the bound is inclusive is not stated here -- confirm.
+   */
+  double lo();
+
+  /**
+   * Returns the high end of this bin.
+   * NOTE(review): whether the bound is inclusive is not stated here -- confirm.
+   */
+  double hi();
+
+  /**
+   * Returns the "ndv" of this bin.
+   * NOTE(review): presumably the number of distinct values inside the bin -- confirm.
+   */
+  long ndv();
+}
diff --git a/.../spark/sql/connector/read/Statistics.java → .../sql/connector/read/stats/Statistics.java b/.../spark/sql/connector/read/Statistics.java → .../sql/connector/read/stats/Statistics.java
@@ -15,11 +15,15 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.connector.read;
+package org.apache.spark.sql.connector.read.stats;
 
+import java.util.HashMap;
+import java.util.Optional;
 import java.util.OptionalLong;
 
 import org.apache.spark.annotation.Evolving;
+import org.apache.spark.sql.connector.expressions.NamedReference;
+import org.apache.spark.sql.connector.read.SupportsReportStatistics;
 
 /**
  * An interface to represent statistics for a data source, which is returned by
@@ -31,4 +35,7 @@
 public interface Statistics {
   OptionalLong sizeInBytes();
   OptionalLong numRows();
+  /**
+   * Returns per-column statistics keyed by the {@link NamedReference} of each column. The
+   * default implementation returns an empty optional, i.e. no column statistics are reported.
+   */
+  default Optional<HashMap<NamedReference, ColumnStatistics>> columnStats() {
+    return Optional.empty();
+  }
 }
diff --git a/...t/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/...t/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
@@ -18,11 +18,12 @@
 package org.apache.spark.sql.execution.datasources.v2
 
 import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelation}
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, SortOrder}
-import org.apache.spark.sql.catalyst.plans.logical.{ExposesMetadataColumns, LeafNode, LogicalPlan, Statistics}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Expression, SortOrder}
+import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, ExposesMetadataColumns, Histogram, HistogramBin, LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.catalyst.util.{truncatedString, CharVarcharUtils}
 import org.apache.spark.sql.connector.catalog.{CatalogPlugin, FunctionCatalog, Identifier, MetadataColumn, SupportsMetadataColumns, Table, TableCapability}
-import org.apache.spark.sql.connector.read.{Scan, Statistics => V2Statistics, SupportsReportStatistics}
+import org.apache.spark.sql.connector.read.{Scan, SupportsReportStatistics}
+import org.apache.spark.sql.connector.read.stats.{Statistics => V2Statistics}
 import org.apache.spark.sql.connector.read.streaming.{Offset, SparkDataStream}
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
 import org.apache.spark.util.Utils
@@ -91,7 +92,7 @@ case class DataSourceV2Relation(
       table.asReadable.newScanBuilder(options).build() match {
         case r: SupportsReportStatistics =>
           val statistics = r.estimateStatistics()
-          DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes)
+          DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes, output)
         case _ =>
           Statistics(sizeInBytes = conf.defaultSizeInBytes)
       }
@@ -142,7 +143,7 @@ case class DataSourceV2ScanRelation(
     scan match {
       case r: SupportsReportStatistics =>
         val statistics = r.estimateStatistics()
-        DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes)
+        DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes, output)
       case _ =>
         Statistics(sizeInBytes = conf.defaultSizeInBytes)
     }
@@ -173,7 +174,7 @@ case class StreamingDataSourceV2Relation(
   override def computeStats(): Statistics = scan match {
     case r: SupportsReportStatistics =>
       val statistics = r.estimateStatistics()
-      DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes)
+      // `output` lets transformV2Stats bind reported column stats to attributes by name.
+      DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes, output)
     case _ =>
       Statistics(sizeInBytes = conf.defaultSizeInBytes)
   }
@@ -214,14 +215,52 @@ object DataSourceV2Relation {
   def transformV2Stats(
       v2Statistics: V2Statistics,
       defaultRowCount: Option[BigInt],
-      defaultSizeInBytes: Long): Statistics = {
+      defaultSizeInBytes: Long,
+      output: Seq[Attribute] = Seq.empty): Statistics = {
     val numRows: Option[BigInt] = if (v2Statistics.numRows().isPresent) {
       Some(v2Statistics.numRows().getAsLong)
     } else {
       defaultRowCount
     }
+
+    // Column statistics are bound to the attributes in `output`: a reported column applies to
+    // every attribute whose name equals the column reference's `describe()` string, and
+    // reported columns without a matching attribute are ignored. With the default (empty)
+    // `output` no column statistics are attached, so three-argument callers are unaffected.
+    def toOption[T](value: java.util.Optional[T]): Option[T] =
+      if (value.isPresent) Some(value.get) else None
+    def toLongOption(value: java.util.OptionalLong): Option[Long] =
+      if (value.isPresent) Some(value.getAsLong) else None
+
+    val colStats = Seq.newBuilder[(Attribute, ColumnStat)]
+    val v2ColumnStats = v2Statistics.columnStats()
+    if (v2ColumnStats.isPresent && output.nonEmpty) {
+      // Index the attributes by name once instead of rescanning `output` for every column.
+      val attributesByName = output.groupBy(_.name)
+      v2ColumnStats.get().forEach { (reference, v2ColStat) =>
+        // Only convert statistics that at least one output attribute will actually use.
+        attributesByName.get(reference.describe()).foreach { attributes =>
+          // Each accessor is read exactly once so a source cannot answer `isPresent` and
+          // `get` inconsistently between two calls.
+          val histogram = toOption(v2ColStat.histogram()).map { v2Histogram =>
+            Histogram(v2Histogram.height(),
+              v2Histogram.bins().map(bin => HistogramBin(bin.lo, bin.hi, bin.ndv)))
+          }
+          val catalystColStat = ColumnStat(
+            toOption(v2ColStat.distinctCount()).map(BigInt(_)),
+            toOption(v2ColStat.min()),
+            toOption(v2ColStat.max()),
+            toOption(v2ColStat.nullCount()).map(BigInt(_)),
+            toLongOption(v2ColStat.avgLen()),
+            toLongOption(v2ColStat.maxLen()),
+            histogram)
+          attributes.foreach(attribute => colStats += (attribute -> catalystColStat))
+        }
+      }
+    }
     Statistics(
       sizeInBytes = v2Statistics.sizeInBytes().orElse(defaultSizeInBytes),
-      rowCount = numRows)
+      rowCount = numRows,
+      attributeStats = AttributeMap(colStats.result()))
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryBaseTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryBaseTable.scala
@@ -17,10 +17,11 @@
 
 package org.apache.spark.sql.connector.catalog
 
+import java.math.BigInteger
 import java.time.{Instant, ZoneId}
 import java.time.temporal.ChronoUnit
 import java.util
-import java.util.OptionalLong
+import java.util.{HashMap, Optional, OptionalLong}
 
 import scala.collection.mutable
 
@@ -34,6 +35,7 @@ import org.apache.spark.sql.connector.expressions._
 import org.apache.spark.sql.connector.metric.{CustomMetric, CustomTaskMetric}
 import org.apache.spark.sql.connector.read._
 import org.apache.spark.sql.connector.read.partitioning.{KeyGroupedPartitioning, Partitioning, UnknownPartitioning}
+import org.apache.spark.sql.connector.read.stats.{ColumnStatistics, Histogram, HistogramBin, Statistics}
 import org.apache.spark.sql.connector.write._
 import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite}
 import org.apache.spark.sql.internal.connector.SupportsStreamingUpdateAsAppend
@@ -273,7 +275,23 @@ abstract class InMemoryBaseTable(
     }
   }
 
-  case class InMemoryStats(sizeInBytes: OptionalLong, numRows: OptionalLong) extends Statistics
+  // Statistics returned by the in-memory scan's estimateStatistics(); `columnStats` overrides
+  // the interface default so that per-column statistics can be supplied (or left empty).
+  case class InMemoryStats(
+      sizeInBytes: OptionalLong,
+      numRows: OptionalLong,
+      override val columnStats: Optional[HashMap[NamedReference, ColumnStatistics]])
+    extends Statistics
+  // ColumnStatistics implementation in which every statistic is passed in explicitly,
+  // overriding each of the interface's empty defaults.
+  case class InMemoryColumnStats (
+      override val distinctCount: Optional[BigInteger],
+      override val min: Optional[AnyRef],
+      override val max: Optional[AnyRef],
+      override val nullCount: Optional[BigInteger],
+      override val avgLen: OptionalLong,
+      override val maxLen: OptionalLong,
+      override val histogram: Optional[Histogram]) extends ColumnStatistics
+
+  // Plain value implementation of the connector HistogramBin interface.
+  case class InMemoryHistogramBin(lo: Double, hi: Double, ndv: Long) extends HistogramBin
+
+  // Plain value implementation of the connector Histogram interface.
+  case class InMemoryHistogram(height: Double, bins: Array[HistogramBin]) extends Histogram
 
   abstract class BatchScanBaseClass(
       var data: Seq[InputPartition],
@@ -285,7 +303,7 @@ abstract class InMemoryBaseTable(
 
     override def estimateStatistics(): Statistics = {
       if (data.isEmpty) {
-        return InMemoryStats(OptionalLong.of(0L), OptionalLong.of(0L))
+        return InMemoryStats(OptionalLong.of(0L), OptionalLong.of(0L), Optional.empty())
       }
 
       val inputPartitions = data.map(_.asInstanceOf[BufferedRows])
@@ -294,7 +312,30 @@ abstract class InMemoryBaseTable(
       val objectHeaderSizeInBytes = 12L
       val rowSizeInBytes = objectHeaderSizeInBytes + schema.defaultSize
       val sizeInBytes = numRows * rowSizeInBytes
-      InMemoryStats(OptionalLong.of(sizeInBytes), OptionalLong.of(numRows))
+
+      val map = new util.HashMap[NamedReference, ColumnStatistics]()
+      val colNames = readSchema.fields.map(_.name)
+      for (col <- colNames) {
+        val fieldReference = FieldReference(col)
+        // put some fake data for testing only
+        val bin1 = InMemoryHistogramBin(1, 2, 5L)
+        val bin2 = InMemoryHistogramBin(3, 4, 5L)
+        val bin3 = InMemoryHistogramBin(5, 6, 5L)
+        val bin4 = InMemoryHistogramBin(7, 8, 5L)
+        val bin5 = InMemoryHistogramBin(9, 10, 5L)
+        val colStats = InMemoryColumnStats(
+          Optional.of[BigInteger](BigInteger.valueOf(5)),
+          Optional.of[AnyRef](Integer.valueOf(0)),
+          Optional.of[AnyRef](Integer.valueOf(5)),
+          Optional.of[BigInteger](BigInteger.valueOf(0)),
+          OptionalLong.of(111L),
+          OptionalLong.of(1111L),
+          Optional.of[Histogram](InMemoryHistogram(5, Array(bin1, bin2, bin3, bin4, bin5)))
+        )
+        map.put(fieldReference, colStats)
+      }
+
+      InMemoryStats(OptionalLong.of(sizeInBytes), OptionalLong.of(numRows), Optional.of(map))
     }
 
     override def outputPartitioning(): Partitioning = {

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala
@@ -27,7 +27,8 @@ import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.expressions.{AttributeSet, Expression, ExpressionSet}
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
 import org.apache.spark.sql.catalyst.plans.QueryPlan
-import org.apache.spark.sql.connector.read.{Batch, InputPartition, Scan, Statistics, SupportsReportStatistics}
+import org.apache.spark.sql.connector.read.{Batch, InputPartition, Scan, SupportsReportStatistics}
+import org.apache.spark.sql.connector.read.stats.Statistics
 import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.execution.PartitionedFileUtil
 import org.apache.spark.sql.execution.datasources._

diff --git a/...ore/src/test/java/test/org/apache/spark/sql/connector/JavaReportStatisticsDataSource.java b/...ore/src/test/java/test/org/apache/spark/sql/connector/JavaReportStatisticsDataSource.java
@@ -23,7 +23,7 @@
 import org.apache.spark.sql.connector.catalog.Table;
 import org.apache.spark.sql.connector.read.InputPartition;
 import org.apache.spark.sql.connector.read.ScanBuilder;
-import org.apache.spark.sql.connector.read.Statistics;
+import org.apache.spark.sql.connector.read.stats.Statistics;
 import org.apache.spark.sql.connector.read.SupportsReportStatistics;
 import org.apache.spark.sql.util.CaseInsensitiveStringMap;