-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Support converting column stats on row type to json in Delta Lake #14314
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,7 @@ | |
| import io.trino.Session; | ||
| import io.trino.execution.QueryManager; | ||
| import io.trino.operator.OperatorStats; | ||
| import io.trino.plugin.deltalake.transactionlog.AddFileEntry; | ||
| import io.trino.plugin.hive.TestingHivePlugin; | ||
| import io.trino.plugin.hive.containers.HiveHadoop; | ||
| import io.trino.plugin.hive.containers.HiveMinioDataLake; | ||
|
|
@@ -40,11 +41,13 @@ | |
| import org.testng.annotations.DataProvider; | ||
| import org.testng.annotations.Test; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Set; | ||
| import java.util.function.BiConsumer; | ||
|
|
||
| import static com.google.common.collect.ImmutableList.toImmutableList; | ||
| import static com.google.common.collect.ImmutableSet.toImmutableSet; | ||
| import static com.google.common.collect.MoreCollectors.onlyElement; | ||
| import static com.google.common.collect.Sets.union; | ||
|
|
@@ -65,6 +68,7 @@ | |
| import static io.trino.tpch.TpchTable.LINE_ITEM; | ||
| import static io.trino.tpch.TpchTable.ORDERS; | ||
| import static java.lang.String.format; | ||
| import static java.util.Comparator.comparing; | ||
| import static java.util.concurrent.TimeUnit.MILLISECONDS; | ||
| import static java.util.concurrent.TimeUnit.SECONDS; | ||
| import static org.assertj.core.api.Assertions.assertThat; | ||
|
|
@@ -93,6 +97,7 @@ public abstract class BaseDeltaLakeConnectorSmokeTest | |
| "old_timestamps", | ||
| "nested_timestamps", | ||
| "nested_timestamps_parquet_stats", | ||
| "json_stats_on_row_type", | ||
| "parquet_stats_missing", | ||
| "uppercase_columns", | ||
| "default_partitions", | ||
|
|
@@ -831,6 +836,59 @@ public void testSelectNestedTimestamps() | |
| assertQuery("SELECT CAST(col1[1].ts AS VARCHAR) FROM nested_timestamps_parquet_stats LIMIT 1", "VALUES '2010-02-03 12:11:10.000 UTC'"); | ||
| } | ||
|
|
||
| @Test | ||
| public void testConvertJsonStatisticsToParquetOnRowType() | ||
| throws Exception | ||
| { | ||
| verifySupportsInsert(); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A "verify ..." method should verify, i.e. ensure something is true. As a follow-up we could rename this to e.g.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will send a follow-up PR. |
||
|
|
||
| assertQuery("SELECT count(*) FROM json_stats_on_row_type", "VALUES 2"); | ||
| String transactionLogDirectory = "json_stats_on_row_type/_delta_log"; | ||
| String newTransactionFile = getLocationForTable(bucketName, "json_stats_on_row_type") + "/_delta_log/00000000000000000004.json"; | ||
| String newCheckpointFile = getLocationForTable(bucketName, "json_stats_on_row_type") + "/_delta_log/00000000000000000004.checkpoint.parquet"; | ||
| assertThat(getTableFiles(transactionLogDirectory)) | ||
| .doesNotContain(newTransactionFile, newCheckpointFile); | ||
|
|
||
| assertUpdate("INSERT INTO json_stats_on_row_type SELECT CAST(row(3) AS row(x bigint)), CAST(row(row('test insert')) AS row(y row(nested varchar)))", 1); | ||
| assertThat(getTableFiles(transactionLogDirectory)) | ||
| .contains(newTransactionFile, newCheckpointFile); | ||
| assertThat(getAddFileEntries("json_stats_on_row_type")).hasSize(3); | ||
|
|
||
| // The first two entries created by Databricks have column stats. The last one doesn't have column stats because the connector doesn't support collecting it on row columns. | ||
| List<AddFileEntry> addFileEntries = getAddFileEntries("json_stats_on_row_type").stream().sorted(comparing(AddFileEntry::getModificationTime)).collect(toImmutableList()); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the intention is that we create transaction 4 and a checkpoint, so let's verify that happened. |
||
| assertThat(addFileEntries).hasSize(3); | ||
| assertJsonStatistics( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The assertions for
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, they're relevant. Those two assertions fail if we don't copy the statistics. |
||
| addFileEntries.get(0), | ||
| "{" + | ||
| "\"numRecords\":1," + | ||
| "\"minValues\":{\"nested_struct_col\":{\"y\":{\"nested\":\"test\"}},\"struct_col\":{\"x\":1}}," + | ||
| "\"maxValues\":{\"nested_struct_col\":{\"y\":{\"nested\":\"test\"}},\"struct_col\":{\"x\":1}}," + | ||
| "\"nullCount\":{\"struct_col\":{\"x\":0},\"nested_struct_col\":{\"y\":{\"nested\":0}}}" + | ||
| "}"); | ||
| assertJsonStatistics( | ||
| addFileEntries.get(1), | ||
| "{" + | ||
| "\"numRecords\":1," + | ||
| "\"minValues\":{\"nested_struct_col\":{\"y\":{}},\"struct_col\":{}}," + | ||
| "\"maxValues\":{\"nested_struct_col\":{\"y\":{}},\"struct_col\":{}}," + | ||
| "\"nullCount\":{\"struct_col\":{\"x\":1},\"nested_struct_col\":{\"y\":{\"nested\":1}}}" + | ||
| "}"); | ||
| assertJsonStatistics( | ||
| addFileEntries.get(2), | ||
| "{\"numRecords\":1,\"minValues\":{},\"maxValues\":{},\"nullCount\":{}}"); | ||
| } | ||
|
|
||
| private List<AddFileEntry> getAddFileEntries(String tableName) | ||
| throws IOException | ||
| { | ||
| return TestingDeltaLakeUtils.getAddFileEntries(getLocationForTable(bucketName, tableName)); | ||
| } | ||
|
|
||
| private void assertJsonStatistics(AddFileEntry addFileEntry, @Language("JSON") String jsonStatistics) | ||
| { | ||
| assertEquals(addFileEntry.getStatsString().orElseThrow(), jsonStatistics); | ||
| } | ||
|
|
||
| @Test | ||
| public void testMissingParquetStats() | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| /* | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
| package io.trino.plugin.deltalake; | ||
|
|
||
| import io.trino.filesystem.hdfs.HdfsFileSystemFactory; | ||
| import io.trino.plugin.deltalake.transactionlog.AddFileEntry; | ||
| import io.trino.plugin.deltalake.transactionlog.TransactionLogAccess; | ||
| import io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager; | ||
| import io.trino.plugin.hive.FileFormatDataSourceStats; | ||
| import io.trino.plugin.hive.parquet.ParquetReaderConfig; | ||
| import io.trino.spi.connector.SchemaTableName; | ||
| import io.trino.testing.TestingConnectorContext; | ||
| import org.apache.hadoop.fs.Path; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.List; | ||
|
|
||
| import static io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT; | ||
| import static io.trino.testing.TestingConnectorSession.SESSION; | ||
|
|
||
| public final class TestingDeltaLakeUtils | ||
| { | ||
| private TestingDeltaLakeUtils() {} | ||
|
|
||
| public static List<AddFileEntry> getAddFileEntries(String tableLocation) | ||
| throws IOException | ||
| { | ||
| SchemaTableName dummyTable = new SchemaTableName("dummy_schema_placeholder", "dummy_table_placeholder"); | ||
| TestingConnectorContext context = new TestingConnectorContext(); | ||
|
|
||
| TransactionLogAccess transactionLogAccess = new TransactionLogAccess( | ||
| context.getTypeManager(), | ||
| new CheckpointSchemaManager(context.getTypeManager()), | ||
| new DeltaLakeConfig(), | ||
| new FileFormatDataSourceStats(), | ||
| new HdfsFileSystemFactory(HDFS_ENVIRONMENT), | ||
| new ParquetReaderConfig()); | ||
|
|
||
| return transactionLogAccess.getActiveFiles(transactionLogAccess.loadSnapshot(dummyTable, new Path(tableLocation), SESSION), SESSION); | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| Data generated using Databricks 10.4: | ||
|
|
||
| ```sql | ||
| CREATE TABLE default.json_stats_on_row_type | ||
| (struct_col struct<x bigint>, nested_struct_col struct<y struct<nested string>>) | ||
| USING DELTA | ||
| LOCATION 's3://bucket/table' | ||
| TBLPROPERTIES ( | ||
| delta.checkpointInterval = 2, | ||
| delta.checkpoint.writeStatsAsJson = false, | ||
| delta.checkpoint.writeStatsAsStruct = true | ||
| ); | ||
|
|
||
| INSERT INTO default.json_stats_on_row_type SELECT named_struct('x', 1), named_struct('y', named_struct('nested', 'test')); | ||
| INSERT INTO default.json_stats_on_row_type SELECT named_struct('x', NULL), named_struct('y', named_struct('nested', NULL)); | ||
|
|
||
| ALTER TABLE default.json_stats_on_row_type SET TBLPROPERTIES ( | ||
| 'delta.checkpoint.writeStatsAsJson' = true, | ||
| 'delta.checkpoint.writeStatsAsStruct' = false | ||
| ); | ||
| ``` | ||
|
Comment on lines
1
to
21
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❤️ |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| {"add":{"path":"part-00000-df481541-fe59-4af2-a37f-68a39a1e2a5d-c000.snappy.parquet","partitionValues":{},"size":1074,"modificationTime":1664924826000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"struct_col\":{},\"nested_struct_col\":{\"y\":{}}},\"maxValues\":{\"struct_col\":{},\"nested_struct_col\":{\"y\":{}}},\"nullCount\":{\"struct_col\":{\"x\":1},\"nested_struct_col\":{\"y\":{\"nested\":1}}}}","tags":{"INSERTION_TIME":"1664924826000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} | ||
| {"commitInfo":{"timestamp":1664924826032,"userId":"7853186923043731","userName":"yuya.ebihara@starburstdata.com","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"2299734316069194"},"clusterId":"0620-043712-o6vqr39c","readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"1074"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"974a8474-26b8-42fc-a2c6-4d6af29109a2"}} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rather than
`getChildren` I think you want to convert the rowBlock to a `ColumnarRow`There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The argument is
`SingleRowBlock`, which is unsupported in `ColumnarRow#toColumnarRow`.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's surprising,
`toColumnarRow` checks that the input is an instance of AbstractRowBlock, which SingleRowBlock extends. Seems like it should work.Where does the error come from?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SingleRowBlock extends AbstractSingleRowBlock, not AbstractRowBlock.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, sorry I can't read