apache · huaxingao · Dec 4, 2022 · Dec 5, 2022 · Dec 5, 2022 · Dec 5, 2022
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/Statistics.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/Statistics.java
@@ -17,9 +17,13 @@
 
 package org.apache.spark.sql.connector.read;
 
+import java.util.HashMap;
+import java.util.Optional;
 import java.util.OptionalLong;
 
 import org.apache.spark.annotation.Evolving;
+import org.apache.spark.sql.connector.expressions.NamedReference;
+import org.apache.spark.sql.connector.read.colstats.ColumnStatistics;
 
 /**
  * An interface to represent statistics for a data source, which is returned by
@@ -31,4 +35,7 @@
 public interface Statistics {
   OptionalLong sizeInBytes();
   OptionalLong numRows();
+  default Optional<HashMap<NamedReference, ColumnStatistics>> columnStats() {
+    return Optional.empty();
+  }
 }
diff --git a/...catalyst/src/main/java/org/apache/spark/sql/connector/read/colstats/ColumnStatistics.java b/...catalyst/src/main/java/org/apache/spark/sql/connector/read/colstats/ColumnStatistics.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.read.colstats;
+
+import org.apache.spark.annotation.Evolving;
+import java.math.BigInteger;
+import java.util.Optional;
+import java.util.OptionalLong;
+
+/**
+ * An interface to represent column statistics, which is part of
+ * {@link Statistics}.
+ *
+ * @since 3.4.0
+ */
+@Evolving
+public interface ColumnStatistics {
+  default Optional<BigInteger> distinctCount() {
+    return Optional.empty();
+  }
+
+  default Optional<Object> min() {
+    return Optional.empty();
+  }
+
+  default Optional<Object> max() {
+    return Optional.empty();
+  }
+
+  default Optional<BigInteger> nullCount() {
+    return Optional.empty();
+  }
+
+  default OptionalLong avgLen() {
+    return OptionalLong.empty();
+  }
+
+  default OptionalLong maxLen() {
+    return OptionalLong.empty();
+  }
+
+  default Optional<Histogram> histogram() {
+    return Optional.empty();
+  }
+}
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/colstats/Histogram.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/colstats/Histogram.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.read.colstats;
+
+import org.apache.spark.annotation.Evolving;
+
+/**
+ * An interface to represent an equi-height histogram, which is a part of
+ * {@link ColumnStatistics}. Equi-height histogram represents the distribution of
+ * a column's values by a sequence of bins.
+ *
+ * @since 3.4.0
+ */
+@Evolving
+public interface Histogram {
+  double height();
+  HistogramBin[] bins();
+}
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/colstats/HistogramBin.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/colstats/HistogramBin.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.read.colstats;
+
+import org.apache.spark.annotation.Evolving;
+
+/**
+ * An interface to represent a bin in an equi-height histogram.
+ *
+ * @since 3.4.0
+ */
+@Evolving
+public interface HistogramBin {
+  double lo();
+  double hi();
+  long ndv();
+}
diff --git a/...t/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/...t/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
@@ -18,11 +18,12 @@
 package org.apache.spark.sql.execution.datasources.v2
 
 import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelation}
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, SortOrder}
-import org.apache.spark.sql.catalyst.plans.logical.{ExposesMetadataColumns, LeafNode, LogicalPlan, Statistics}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Expression, SortOrder}
+import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, ExposesMetadataColumns, Histogram, HistogramBin, LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.catalyst.util.{truncatedString, CharVarcharUtils}
 import org.apache.spark.sql.connector.catalog.{CatalogPlugin, FunctionCatalog, Identifier, MetadataColumn, SupportsMetadataColumns, Table, TableCapability}
-import org.apache.spark.sql.connector.read.{Scan, Statistics => V2Statistics, SupportsReportStatistics}
+import org.apache.spark.sql.connector.read.{Scan, SupportsReportStatistics}
+import org.apache.spark.sql.connector.read.{Statistics => V2Statistics}
 import org.apache.spark.sql.connector.read.streaming.{Offset, SparkDataStream}
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
 import org.apache.spark.util.Utils
@@ -91,7 +92,7 @@ case class DataSourceV2Relation(
       table.asReadable.newScanBuilder(options).build() match {
         case r: SupportsReportStatistics =>
           val statistics = r.estimateStatistics()
-          DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes)
+          DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes, output)
         case _ =>
           Statistics(sizeInBytes = conf.defaultSizeInBytes)
       }
@@ -142,7 +143,7 @@ case class DataSourceV2ScanRelation(
     scan match {
       case r: SupportsReportStatistics =>
         val statistics = r.estimateStatistics()
-        DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes)
+        DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes, output)
       case _ =>
         Statistics(sizeInBytes = conf.defaultSizeInBytes)
     }
@@ -173,7 +174,7 @@ case class StreamingDataSourceV2Relation(
   override def computeStats(): Statistics = scan match {
     case r: SupportsReportStatistics =>
       val statistics = r.estimateStatistics()
-      DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes)
+      DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes, output)
     case _ =>
       Statistics(sizeInBytes = conf.defaultSizeInBytes)
   }
@@ -214,14 +215,52 @@ object DataSourceV2Relation {
   def transformV2Stats(
       v2Statistics: V2Statistics,
       defaultRowCount: Option[BigInt],
-      defaultSizeInBytes: Long): Statistics = {
+      defaultSizeInBytes: Long,
+      output: Seq[Attribute] = Seq.empty): Statistics = {
     val numRows: Option[BigInt] = if (v2Statistics.numRows().isPresent) {
       Some(v2Statistics.numRows().getAsLong)
     } else {
       defaultRowCount
     }
+
+    var colStats: Seq[(Attribute, ColumnStat)] = Seq.empty[(Attribute, ColumnStat)]
+    if (v2Statistics.columnStats().isPresent) {
+      val v2ColumnStat = v2Statistics.columnStats().get()
+      val keys = v2ColumnStat.keySet()
+
+      keys.forEach(key => {
+        val colStat = v2ColumnStat.get(key)
+        val distinct: Option[BigInt] =
+          if (colStat.distinctCount().isPresent) Some(colStat.distinctCount().get) else None
+        val min: Option[Any] = if (colStat.min().isPresent) Some(colStat.min().get) else None
+        val max: Option[Any] = if (colStat.max().isPresent) Some(colStat.max().get) else None
+        val nullCount: Option[BigInt] =
+          if (colStat.nullCount().isPresent) Some(colStat.nullCount().get()) else None
+        val avgLen: Option[Long] =
+          if (colStat.avgLen().isPresent) Some(colStat.avgLen().getAsLong) else None
+        val maxLen: Option[Long] =
+          if (colStat.maxLen().isPresent) Some(colStat.maxLen().getAsLong) else None
+        val histogram = if (colStat.histogram().isPresent) {
+          val v2Histogram = colStat.histogram().get()
+          val bins = v2Histogram.bins()
+          Some(Histogram(v2Histogram.height(),
+            bins.map(bin => HistogramBin(bin.lo, bin.hi, bin.ndv))))
+        } else {
+          None
+        }
+
+        val catalystColStat = ColumnStat(distinct, min, max, nullCount, avgLen, maxLen, histogram)
+
+        output.foreach(attribute => {
+          if (attribute.name.equals(key.describe())) {
+            colStats = colStats :+ (attribute -> catalystColStat)
+          }
+        })
+      })
+    }
     Statistics(
       sizeInBytes = v2Statistics.sizeInBytes().orElse(defaultSizeInBytes),
-      rowCount = numRows)
+      rowCount = numRows,
+      attributeStats = AttributeMap(colStats))
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryBaseTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryBaseTable.scala
@@ -17,10 +17,11 @@
 
 package org.apache.spark.sql.connector.catalog
 
+import java.math.BigInteger
 import java.time.{Instant, ZoneId}
 import java.time.temporal.ChronoUnit
 import java.util
-import java.util.OptionalLong
+import java.util.{HashMap, Optional, OptionalLong}
 
 import scala.collection.mutable
 
@@ -33,6 +34,7 @@ import org.apache.spark.sql.connector.distributions.{Distribution, Distributions
 import org.apache.spark.sql.connector.expressions._
 import org.apache.spark.sql.connector.metric.{CustomMetric, CustomTaskMetric}
 import org.apache.spark.sql.connector.read._
+import org.apache.spark.sql.connector.read.colstats.{ColumnStatistics, Histogram, HistogramBin}
 import org.apache.spark.sql.connector.read.partitioning.{KeyGroupedPartitioning, Partitioning, UnknownPartitioning}
 import org.apache.spark.sql.connector.write._
 import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite}
@@ -273,7 +275,24 @@ abstract class InMemoryBaseTable(
     }
   }
 
-  case class InMemoryStats(sizeInBytes: OptionalLong, numRows: OptionalLong) extends Statistics
+  case class InMemoryStats(
+      sizeInBytes: OptionalLong,
+      numRows: OptionalLong,
+      override val columnStats: Optional[HashMap[NamedReference, ColumnStatistics]])
+    extends Statistics
+
+  case class InMemoryColumnStats (
+      override val distinctCount: Optional[BigInteger],
+      override val min: Optional[AnyRef],
+      override val max: Optional[AnyRef],
+      override val nullCount: Optional[BigInteger],
+      override val avgLen: OptionalLong,
+      override val maxLen: OptionalLong,
+      override val histogram: Optional[Histogram]) extends ColumnStatistics
+
+  case class InMemoryHistogramBin(lo: Double, hi: Double, ndv: Long) extends HistogramBin
+
+  case class InMemoryHistogram(height: Double, bins: Array[HistogramBin]) extends Histogram
 
   abstract class BatchScanBaseClass(
       var data: Seq[InputPartition],
@@ -285,7 +304,7 @@ abstract class InMemoryBaseTable(
 
     override def estimateStatistics(): Statistics = {
       if (data.isEmpty) {
-        return InMemoryStats(OptionalLong.of(0L), OptionalLong.of(0L))
+        return InMemoryStats(OptionalLong.of(0L), OptionalLong.of(0L), Optional.empty())
       }
 
       val inputPartitions = data.map(_.asInstanceOf[BufferedRows])
@@ -294,7 +313,30 @@ abstract class InMemoryBaseTable(
       val objectHeaderSizeInBytes = 12L
       val rowSizeInBytes = objectHeaderSizeInBytes + schema.defaultSize
       val sizeInBytes = numRows * rowSizeInBytes
-      InMemoryStats(OptionalLong.of(sizeInBytes), OptionalLong.of(numRows))
+
+      val map = new util.HashMap[NamedReference, ColumnStatistics]()
+      val colNames = readSchema.fields.map(_.name)
+      for (col <- colNames) {
+        val fieldReference = FieldReference(col)
+        // put some fake data for testing only
+        val bin1 = InMemoryHistogramBin(1, 2, 5L)
+        val bin2 = InMemoryHistogramBin(3, 4, 5L)
+        val bin3 = InMemoryHistogramBin(5, 6, 5L)
+        val bin4 = InMemoryHistogramBin(7, 8, 5L)
+        val bin5 = InMemoryHistogramBin(9, 10, 5L)
+        val colStats = InMemoryColumnStats(
+          Optional.of[BigInteger](BigInteger.valueOf(5)),
+          Optional.of[AnyRef](Integer.valueOf(0)),
+          Optional.of[AnyRef](Integer.valueOf(5)),
+          Optional.of[BigInteger](BigInteger.valueOf(0)),
+          OptionalLong.of(111L),
+          OptionalLong.of(1111L),
+          Optional.of[Histogram](InMemoryHistogram(5, Array(bin1, bin2, bin3, bin4, bin5)))
+        )
+        map.put(fieldReference, colStats)
+      }
+
+      InMemoryStats(OptionalLong.of(sizeInBytes), OptionalLong.of(numRows), Optional.of(map))
     }
 
     override def outputPartitioning(): Partitioning = {

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala
@@ -35,6 +35,7 @@ import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAM
 import org.apache.spark.sql.connector.catalog.CatalogV2Util.withDefaultOwnership
 import org.apache.spark.sql.errors.QueryErrorsBase
 import org.apache.spark.sql.execution.columnar.InMemoryRelation
+import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation
 import org.apache.spark.sql.execution.streaming.MemoryStream
 import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
 import org.apache.spark.sql.internal.SQLConf.{PARTITION_OVERWRITE_MODE, PartitionOverwriteMode, V2_SESSION_CATALOG_IMPLEMENTATION}
@@ -2772,6 +2773,39 @@ class DataSourceV2SQLSuiteV1Filter
     }
   }
 
+  test("SPARK-41378: test column stats") {
+    spark.sql("CREATE TABLE testcat.test (id bigint NOT NULL, data string)")
+    spark.sql("INSERT INTO testcat.test values (1, 'test1'), (2, 'test2'), (3, 'test3')," +
+      " (4, 'test4'), (5, 'test5')")
+    val df = spark.sql("select * from testcat.test")
+
+    df.queryExecution.optimizedPlan.collect {
+      case scan: DataSourceV2ScanRelation =>
+        val stats = scan.stats
+        assert(stats.sizeInBytes == 200)
+        assert(stats.rowCount.get == 5)
+        val colStats = stats.attributeStats.values.toArray
+        assert(colStats.length == 2)
+        colStats.foreach(stat => {
+          assert(stat.distinctCount.get == 5)
+          assert(stat.min.get == 0)
+          assert(stat.max.get == 5)
+          assert(stat.nullCount.get == 0)
+          assert(stat.avgLen.get == 111L)
+          assert(stat.maxLen.get == 1111L)
+          assert(stat.histogram.get.height == 5)
+          val bins = stat.histogram.get.bins
+          var i = 0
+          bins.foreach(bin => {
+            assert(bin.hi == 2.0 + 2 * i)
+            assert(bin.lo == 1.0 + 2 * i)
+            assert(bin.ndv == 5)
+            i += 1
+          })
+        })
+    }
+  }
+
   private def testNotSupportedV2Command(sqlCommand: String, sqlParams: String): Unit = {
     checkError(
       exception = intercept[AnalysisException] {