diff --git a/lib/trino-metastore/src/main/java/io/trino/metastore/HiveType.java b/lib/trino-metastore/src/main/java/io/trino/metastore/HiveType.java
index 2f3bdb3a87bf..21247b772d94 100644
--- a/lib/trino-metastore/src/main/java/io/trino/metastore/HiveType.java
+++ b/lib/trino-metastore/src/main/java/io/trino/metastore/HiveType.java
@@ -34,6 +34,7 @@
import static io.trino.metastore.type.TypeConstants.TIMESTAMPLOCALTZ_TYPE_NAME;
import static io.trino.metastore.type.TypeConstants.TIMESTAMP_TYPE_NAME;
import static io.trino.metastore.type.TypeConstants.TINYINT_TYPE_NAME;
+import static io.trino.metastore.type.TypeConstants.VARIANT_TYPE_NAME;
import static io.trino.metastore.type.TypeInfoFactory.getPrimitiveTypeInfo;
import static io.trino.metastore.type.TypeInfoUtils.getTypeInfoFromTypeString;
import static io.trino.metastore.type.TypeInfoUtils.getTypeInfosFromTypeString;
@@ -55,6 +56,7 @@ public final class HiveType
public static final HiveType HIVE_TIMESTAMPLOCALTZ = new HiveType(getPrimitiveTypeInfo(TIMESTAMPLOCALTZ_TYPE_NAME));
public static final HiveType HIVE_DATE = new HiveType(getPrimitiveTypeInfo(DATE_TYPE_NAME));
public static final HiveType HIVE_BINARY = new HiveType(getPrimitiveTypeInfo(BINARY_TYPE_NAME));
+ public static final HiveType HIVE_VARIANT = new HiveType(getPrimitiveTypeInfo(VARIANT_TYPE_NAME));
private final HiveTypeName hiveTypeName;
private final TypeInfo typeInfo;
diff --git a/lib/trino-metastore/src/main/java/io/trino/metastore/type/PrimitiveCategory.java b/lib/trino-metastore/src/main/java/io/trino/metastore/type/PrimitiveCategory.java
index f0336cff7272..dac44e8c5592 100644
--- a/lib/trino-metastore/src/main/java/io/trino/metastore/type/PrimitiveCategory.java
+++ b/lib/trino-metastore/src/main/java/io/trino/metastore/type/PrimitiveCategory.java
@@ -18,5 +18,5 @@ public enum PrimitiveCategory
{
VOID, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING,
DATE, TIMESTAMP, TIMESTAMPLOCALTZ, BINARY, DECIMAL, VARCHAR, CHAR,
- INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME, UNKNOWN
+ INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME, VARIANT, UNKNOWN
}
diff --git a/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeConstants.java b/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeConstants.java
index df41b965ce14..147e9f1f54a4 100644
--- a/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeConstants.java
+++ b/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeConstants.java
@@ -35,6 +35,7 @@ private TypeConstants() {}
public static final String BINARY_TYPE_NAME = "binary";
public static final String INTERVAL_YEAR_MONTH_TYPE_NAME = "interval_year_month";
public static final String INTERVAL_DAY_TIME_TYPE_NAME = "interval_day_time";
+ public static final String VARIANT_TYPE_NAME = "variant";
public static final String LIST_TYPE_NAME = "array";
public static final String MAP_TYPE_NAME = "map";
diff --git a/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeInfoUtils.java b/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeInfoUtils.java
index dc1a94fc910a..bb351d433650 100644
--- a/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeInfoUtils.java
+++ b/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeInfoUtils.java
@@ -61,6 +61,7 @@ private TypeInfoUtils() {}
registerType(new PrimitiveTypeEntry(PrimitiveCategory.INTERVAL_YEAR_MONTH, TypeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME));
registerType(new PrimitiveTypeEntry(PrimitiveCategory.INTERVAL_DAY_TIME, TypeConstants.INTERVAL_DAY_TIME_TYPE_NAME));
registerType(new PrimitiveTypeEntry(PrimitiveCategory.DECIMAL, TypeConstants.DECIMAL_TYPE_NAME));
+ registerType(new PrimitiveTypeEntry(PrimitiveCategory.VARIANT, TypeConstants.VARIANT_TYPE_NAME));
registerType(new PrimitiveTypeEntry(PrimitiveCategory.UNKNOWN, "unknown"));
}
diff --git a/lib/trino-parquet/pom.xml b/lib/trino-parquet/pom.xml
index 9cecbda33c17..f53194af3944 100644
--- a/lib/trino-parquet/pom.xml
+++ b/lib/trino-parquet/pom.xml
@@ -13,6 +13,11 @@
    <description>Trino - Parquet file format support</description>

+    <dependency>
+        <groupId>com.fasterxml.jackson.core</groupId>
+        <artifactId>jackson-core</artifactId>
+    </dependency>
+
    <dependency>
        <groupId>com.google.errorprone</groupId>
        <artifactId>error_prone_annotations</artifactId>
diff --git a/lib/trino-parquet/src/main/java/io/trino/parquet/ParquetTypeUtils.java b/lib/trino-parquet/src/main/java/io/trino/parquet/ParquetTypeUtils.java
index 34980fd6af82..85ce8a555875 100644
--- a/lib/trino-parquet/src/main/java/io/trino/parquet/ParquetTypeUtils.java
+++ b/lib/trino-parquet/src/main/java/io/trino/parquet/ParquetTypeUtils.java
@@ -39,8 +39,11 @@
import java.util.Map;
import java.util.Optional;
+import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
+import static io.trino.spi.type.StandardTypes.JSON;
+import static io.trino.spi.type.VarbinaryType.VARBINARY;
import static java.lang.String.format;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
import static org.apache.parquet.schema.Type.Repetition.REPEATED;
@@ -291,6 +294,15 @@ private static Optional<Field> constructField(Type type, ColumnIO columnIO, bool
boolean required = columnIO.getType().getRepetition() != OPTIONAL;
int repetitionLevel = columnIO.getRepetitionLevel();
int definitionLevel = columnIO.getDefinitionLevel();
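+ // The Parquet layout of an unshredded variant is a group with two binary children named
+ // "value" and "metadata" (see isVariantType below); read both as VARBINARY and pair them
+ // in a VariantField.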
+ if (isVariantType(type, columnIO)) {
+ checkArgument(type.getTypeParameters().isEmpty(), "Expected type parameters to be empty for variant but got %s", type.getTypeParameters());
+ if (!(columnIO instanceof GroupColumnIO groupColumnIo)) {
+ throw new IllegalStateException("Expected columnIO to be GroupColumnIO but got %s".formatted(columnIO.getClass().getSimpleName()));
+ }
+ Field valueField = constructField(VARBINARY, groupColumnIo.getChild(0), false).orElseThrow();
+ Field metadataField = constructField(VARBINARY, groupColumnIo.getChild(1), false).orElseThrow();
+ return Optional.of(new VariantField(type, repetitionLevel, definitionLevel, required, valueField, metadataField));
+ }
if (type instanceof RowType rowType) {
GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO;
ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
@@ -350,4 +362,13 @@ private static Optional<Field> constructField(Type type, ColumnIO columnIO, bool
}
return Optional.of(new PrimitiveField(type, required, primitiveColumnIO.getColumnDescriptor(), primitiveColumnIO.getId()));
}
+
+ private static boolean isVariantType(Type type, ColumnIO columnIO)
+ {
+ return type.getTypeSignature().getBase().equals(JSON) &&
+ columnIO instanceof GroupColumnIO groupColumnIo &&
+ groupColumnIo.getChildrenCount() == 2 &&
+ groupColumnIo.getChild("value") != null &&
+ groupColumnIo.getChild("metadata") != null;
+ }
}
diff --git a/lib/trino-parquet/src/main/java/io/trino/parquet/VariantField.java b/lib/trino-parquet/src/main/java/io/trino/parquet/VariantField.java
new file mode 100644
index 000000000000..f9b65baf4ca9
--- /dev/null
+++ b/lib/trino-parquet/src/main/java/io/trino/parquet/VariantField.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.parquet;
+
+import io.trino.spi.type.Type;
+
+import static com.google.common.base.MoreObjects.toStringHelper;
+import static java.util.Objects.requireNonNull;
+
+public class VariantField
+ extends Field
+{
+ private final Field value;
+ private final Field metadata;
+
+ public VariantField(Type type, int repetitionLevel, int definitionLevel, boolean required, Field value, Field metadata)
+ {
+ super(type, repetitionLevel, definitionLevel, required);
+ this.value = requireNonNull(value, "value is null");
+ this.metadata = requireNonNull(metadata, "metadata is null");
+ }
+
+ public Field getValue()
+ {
+ return value;
+ }
+
+ public Field getMetadata()
+ {
+ return metadata;
+ }
+
+ @Override
+ public String toString()
+ {
+ return toStringHelper(this)
+ .add("type", getType())
+ .add("repetitionLevel", getRepetitionLevel())
+ .add("definitionLevel", getDefinitionLevel())
+ .add("required", isRequired())
+ .add("value", value)
+ .add("metadata", getMetadata())
+ .toString();
+ }
+}
diff --git a/lib/trino-parquet/src/main/java/io/trino/parquet/reader/ParquetReader.java b/lib/trino-parquet/src/main/java/io/trino/parquet/reader/ParquetReader.java
index c77d013fdc35..5ba0976581bf 100644
--- a/lib/trino-parquet/src/main/java/io/trino/parquet/reader/ParquetReader.java
+++ b/lib/trino-parquet/src/main/java/io/trino/parquet/reader/ParquetReader.java
@@ -19,6 +19,7 @@
import com.google.common.collect.ListMultimap;
import com.google.errorprone.annotations.FormatMethod;
import io.airlift.log.Logger;
+import io.airlift.slice.Slice;
import io.trino.memory.context.AggregatedMemoryContext;
import io.trino.parquet.ChunkKey;
import io.trino.parquet.Column;
@@ -30,14 +31,17 @@
import io.trino.parquet.ParquetReaderOptions;
import io.trino.parquet.ParquetWriteValidation;
import io.trino.parquet.PrimitiveField;
+import io.trino.parquet.VariantField;
import io.trino.parquet.metadata.ColumnChunkMetadata;
import io.trino.parquet.metadata.PrunedBlockMetadata;
import io.trino.parquet.predicate.TupleDomainParquetPredicate;
import io.trino.parquet.reader.FilteredOffsetIndex.OffsetRange;
+import io.trino.parquet.spark.Variant;
import io.trino.plugin.base.metrics.LongCount;
import io.trino.spi.Page;
import io.trino.spi.block.ArrayBlock;
import io.trino.spi.block.Block;
+import io.trino.spi.block.BlockBuilder;
import io.trino.spi.block.DictionaryBlock;
import io.trino.spi.block.RowBlock;
import io.trino.spi.block.RunLengthEncodedBlock;
@@ -59,6 +63,7 @@
import java.io.Closeable;
import java.io.IOException;
+import java.time.ZoneId;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -69,6 +74,7 @@
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
+import static io.airlift.slice.Slices.utf8Slice;
import static io.trino.parquet.ParquetValidationUtils.validateParquet;
import static io.trino.parquet.ParquetWriteValidation.StatisticsValidation;
import static io.trino.parquet.ParquetWriteValidation.StatisticsValidation.createStatisticsValidationBuilder;
@@ -76,6 +82,8 @@
import static io.trino.parquet.ParquetWriteValidation.WriteChecksumBuilder.createWriteChecksumBuilder;
import static io.trino.parquet.reader.ListColumnReader.calculateCollectionOffsets;
import static io.trino.parquet.reader.PageReader.createPageReader;
+import static io.trino.spi.type.VarbinaryType.VARBINARY;
+import static io.trino.spi.type.VarcharType.VARCHAR;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.Math.toIntExact;
@@ -97,6 +105,7 @@ public class ParquetReader
private final List<Column> columnFields;
private final List<PrimitiveField> primitiveFields;
private final ParquetDataSource dataSource;
+ private final ZoneId zoneId;
private final ColumnReaderFactory columnReaderFactory;
private final AggregatedMemoryContext memoryContext;
@@ -149,6 +158,7 @@ public ParquetReader(
this.primitiveFields = getPrimitiveFields(columnFields.stream().map(Column::field).collect(toImmutableList()));
this.rowGroups = requireNonNull(rowGroups, "rowGroups is null");
this.dataSource = requireNonNull(dataSource, "dataSource is null");
+ this.zoneId = requireNonNull(timeZone, "timeZone is null").toTimeZone().toZoneId();
this.columnReaderFactory = new ColumnReaderFactory(timeZone, options);
this.memoryContext = requireNonNull(memoryContext, "memoryContext is null");
this.currentRowGroupMemoryContext = memoryContext.newAggregatedMemoryContext();
@@ -332,6 +342,25 @@ private void freeCurrentRowGroupBuffers()
}
}
+ private ColumnChunk readVariant(VariantField field)
+ throws IOException
+ {
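+ // Variant columns are exposed as the JSON type, so decode the (value, metadata) binary pair
+ // into JSON text.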
+ ColumnChunk valueChunk = readColumnChunk(field.getValue());
+
+ BlockBuilder variantBlock = VARCHAR.createBlockBuilder(null, 1);
+ if (valueChunk.getBlock().getPositionCount() == 0) {
+ variantBlock.appendNull();
+ }
+ else {
+ ColumnChunk metadataChunk = readColumnChunk(field.getMetadata());
+ Slice value = VARBINARY.getSlice(valueChunk.getBlock(), 0);
+ Slice metadata = VARBINARY.getSlice(metadataChunk.getBlock(), 0);
+ Variant variant = new Variant(value.byteArray(), metadata.byteArray());
+ VARCHAR.writeSlice(variantBlock, utf8Slice(variant.toJson(zoneId)));
+ }
+ return new ColumnChunk(variantBlock.build(), valueChunk.getDefinitionLevels(), valueChunk.getRepetitionLevels());
+ }
+
private ColumnChunk readArray(GroupField field)
throws IOException
{
@@ -523,6 +552,10 @@ else if (field instanceof GroupField groupField) {
.flatMap(Optional::stream)
.forEach(child -> parseField(child, primitiveFields));
}
+ else if (field instanceof VariantField variantField) {
+ parseField(variantField.getValue(), primitiveFields);
+ parseField(variantField.getMetadata(), primitiveFields);
+ }
}
public Block readBlock(Field field)
@@ -535,7 +568,10 @@ private ColumnChunk readColumnChunk(Field field)
throws IOException
{
ColumnChunk columnChunk;
- if (field.getType() instanceof RowType) {
+ if (field instanceof VariantField variantField) {
+ columnChunk = readVariant(variantField);
+ }
+ else if (field.getType() instanceof RowType) {
columnChunk = readStruct((GroupField) field);
}
else if (field.getType() instanceof MapType) {
diff --git a/lib/trino-parquet/src/main/java/io/trino/parquet/spark/Variant.java b/lib/trino-parquet/src/main/java/io/trino/parquet/spark/Variant.java
new file mode 100644
index 000000000000..12b7d6a69817
--- /dev/null
+++ b/lib/trino-parquet/src/main/java/io/trino/parquet/spark/Variant.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.parquet.spark;
+
+import com.fasterxml.jackson.core.JsonGenerator;
+
+import java.io.CharArrayWriter;
+import java.io.IOException;
+import java.time.Instant;
+import java.time.LocalDate;
+import java.time.ZoneId;
+import java.time.ZoneOffset;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeFormatterBuilder;
+import java.util.Base64;
+import java.util.Locale;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static io.trino.parquet.spark.VariantUtil.SIZE_LIMIT;
+import static io.trino.parquet.spark.VariantUtil.VERSION;
+import static io.trino.parquet.spark.VariantUtil.VERSION_MASK;
+import static io.trino.parquet.spark.VariantUtil.getBinary;
+import static io.trino.parquet.spark.VariantUtil.getBoolean;
+import static io.trino.parquet.spark.VariantUtil.getDecimal;
+import static io.trino.parquet.spark.VariantUtil.getDouble;
+import static io.trino.parquet.spark.VariantUtil.getFloat;
+import static io.trino.parquet.spark.VariantUtil.getLong;
+import static io.trino.parquet.spark.VariantUtil.getMetadataKey;
+import static io.trino.parquet.spark.VariantUtil.getString;
+import static io.trino.parquet.spark.VariantUtil.getType;
+import static io.trino.parquet.spark.VariantUtil.handleArray;
+import static io.trino.parquet.spark.VariantUtil.handleObject;
+import static io.trino.parquet.spark.VariantUtil.readUnsigned;
+import static io.trino.plugin.base.util.JsonUtils.jsonFactory;
+import static java.time.format.DateTimeFormatter.ISO_LOCAL_DATE;
+import static java.time.format.DateTimeFormatter.ISO_LOCAL_TIME;
+import static java.time.temporal.ChronoUnit.MICROS;
+
+/**
+ * Copied from https://github.com/apache/spark/blob/53d65fd12dd9231139188227ef9040d40d759021/common/variant/src/main/java/org/apache/spark/types/variant/Variant.java
+ * and adjusted the code style.
+ */
+public final class Variant
+{
+ private static final DateTimeFormatter TIMESTAMP_NTZ_FORMATTER = new DateTimeFormatterBuilder()
+ .append(ISO_LOCAL_DATE)
+ .appendLiteral(' ')
+ .append(ISO_LOCAL_TIME)
+ .toFormatter(Locale.US);
+
+ private static final DateTimeFormatter TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder()
+ .append(TIMESTAMP_NTZ_FORMATTER)
+ .appendOffset("+HH:MM", "+00:00")
+ .toFormatter(Locale.US);
+
+ private final byte[] value;
+ private final byte[] metadata;
+ // The variant value doesn't use the whole `value` binary, but starts from its `position` index.
+ // This design avoids frequent copies of the value binary when reading a sub-variant in an
+ // array/object element.
+ private final int position;
+
+ public Variant(byte[] value, byte[] metadata)
+ {
+ this(value, metadata, 0);
+ }
+
+ private Variant(byte[] value, byte[] metadata, int position)
+ {
+ this.value = value;
+ this.metadata = metadata;
+ this.position = position;
+ checkArgument(metadata.length >= 1, "metadata must be present");
+ checkArgument((metadata[0] & VERSION_MASK) == VERSION, "metadata version must be %s", VERSION);
+ // Don't attempt to use a Variant larger than 16 MiB. We'll never produce one, and it risks memory instability.
+ checkArgument(metadata.length <= SIZE_LIMIT, "max metadata size is %s: %s", SIZE_LIMIT, metadata.length);
+ checkArgument(value.length <= SIZE_LIMIT, "max value size is %s: %s", SIZE_LIMIT, value.length);
+ }
+
+ // Stringify the variant in JSON format.
+ public String toJson(ZoneId zoneId)
+ {
+ StringBuilder json = new StringBuilder();
+ toJsonImpl(value, metadata, position, json, zoneId);
+ return json.toString();
+ }
+
+ private static void toJsonImpl(byte[] value, byte[] metadata, int position, StringBuilder json, ZoneId zoneId)
+ {
+ switch (getType(value, position)) {
+ case NULL -> json.append("null");
+ case BOOLEAN -> json.append(getBoolean(value, position));
+ case LONG -> json.append(getLong(value, position));
+ case FLOAT -> json.append(getFloat(value, position));
+ case DOUBLE -> json.append(getDouble(value, position));
+ case DECIMAL -> json.append(getDecimal(value, position).toPlainString());
+ case STRING -> json.append(escapeJson(getString(value, position)));
+ case BINARY -> appendQuoted(json, Base64.getEncoder().encodeToString(getBinary(value, position)));
+ case DATE -> appendQuoted(json, LocalDate.ofEpochDay(getLong(value, position)).toString());
+ case TIMESTAMP -> appendQuoted(json, TIMESTAMP_FORMATTER.format(microsToInstant(getLong(value, position)).atZone(zoneId)));
+ case TIMESTAMP_NTZ -> appendQuoted(json, TIMESTAMP_NTZ_FORMATTER.format(microsToInstant(getLong(value, position)).atZone(ZoneOffset.UTC)));
+ case ARRAY -> handleArray(value, position, (size, offsetSize, offsetStart, dataStart) -> {
+ json.append('[');
+ for (int i = 0; i < size; ++i) {
+ int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize);
+ int elementPos = dataStart + offset;
+ if (i != 0) {
+ json.append(',');
+ }
+ toJsonImpl(value, metadata, elementPos, json, zoneId);
+ }
+ json.append(']');
+ return null;
+ });
+ case OBJECT -> handleObject(value, position, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+ json.append('{');
+ for (int i = 0; i < size; ++i) {
+ int id = readUnsigned(value, idStart + idSize * i, idSize);
+ int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize);
+ int elementPosition = dataStart + offset;
+ if (i != 0) {
+ json.append(',');
+ }
+ json.append(escapeJson(getMetadataKey(metadata, id)));
+ json.append(':');
+ toJsonImpl(value, metadata, elementPosition, json, zoneId);
+ }
+ json.append('}');
+ return null;
+ });
+ }
+ }
+
+ private static Instant microsToInstant(long timestamp)
+ {
+ return Instant.EPOCH.plus(timestamp, MICROS);
+ }
+
+ // A simplified and more performant version of `json.append(escapeJson(value))`. It is used when we
+ // know `value` doesn't contain any special character that needs escaping.
+ private static void appendQuoted(StringBuilder json, String value)
+ {
+ json.append('"').append(value).append('"');
+ }
+
+ // Escape a string so that it can be embedded in a JSON document.
+ // For example, if `value` only contains a new-line character, then the result content is "\n"
+ // (4 characters).
+ private static String escapeJson(String value)
+ {
+ try (CharArrayWriter writer = new CharArrayWriter();
+ JsonGenerator generator = jsonFactory().createGenerator(writer)) {
+ generator.writeString(value);
+ generator.flush();
+ return writer.toString();
+ }
+ catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+}
diff --git a/lib/trino-parquet/src/main/java/io/trino/parquet/spark/VariantUtil.java b/lib/trino-parquet/src/main/java/io/trino/parquet/spark/VariantUtil.java
new file mode 100644
index 000000000000..c17c33f647f0
--- /dev/null
+++ b/lib/trino-parquet/src/main/java/io/trino/parquet/spark/VariantUtil.java
@@ -0,0 +1,480 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.parquet.spark;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Copied from https://github.com/apache/spark/blob/53d65fd12dd9231139188227ef9040d40d759021/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java
+ *
+ * This class defines constants related to the variant format and provides functions for
+ * manipulating variant binaries.
+ *
+ * A variant is made up of 2 binaries: value and metadata. A variant value consists of a one-byte
+ * header and a number of content bytes (can be zero). The header byte is divided into upper 6 bits
+ * (called "type info") and lower 2 bits (called "basic type"). The content format is explained in
+ * the below constants for all possible basic type and type info values.
+ *
+ * The variant metadata includes a version id and a dictionary of distinct strings (case-sensitive).
+ * Its binary format is:
+ * - Version: 1-byte unsigned integer. The only acceptable value is 1 currently.
+ * - Dictionary size: 4-byte little-endian unsigned integer. The number of keys in the
+ * dictionary.
+ * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. `offsets[i]` represents the
+ * starting position of string i, counting starting from the address of `offsets[0]`. Strings
+ * must be stored contiguously, so we don't need to store the string size; instead, we compute it
+ * with `offset[i + 1] - offset[i]`.
+ * - UTF-8 string data.
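+ *
+ * For example, with the header selecting 1-byte offsets (see `getMetadataKey`), the metadata
+ * bytes {0x01, 0x01, 0x00, 0x03, 'c', 'o', 'l'} encode version 1 and the one-entry dictionary
+ * ["col"]: dictionary size 1, offsets [0, 3], then the UTF-8 string data "col".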
+ */
+public final class VariantUtil
+{
+ public static final int BASIC_TYPE_BITS = 2;
+ public static final int BASIC_TYPE_MASK = 0x3;
+ public static final int TYPE_INFO_MASK = 0x3F;
+
+ // Below are all the possible basic type values.
+ // Primitive value. The type info value must be one of the values in the below section.
+ public static final int PRIMITIVE = 0;
+ // Short string value. The type info value is the string size, which must be in `[0,
+ // kMaxShortStrSize]`.
+ // The string content bytes directly follow the header byte.
+ public static final int SHORT_STR = 1;
+ // Object value. The content contains a size, a list of field ids, a list of field offsets, and
+ // the actual field data. The length of the id list is `size`, while the length of the offset
+ // list is `size + 1`, where the last offset represents the total size of the field data. The
+ // fields in an object must be sorted by the field name in alphabetical order. Duplicate field
+ // names in one object are not allowed.
+ // We use 5 bits in the type info to specify the integer type of the object header: it should
+ // be 0_b4_b3b2_b1b0 (MSB is 0), where:
+ // - b4 specifies the type of size. When it is 0/1, `size` is a little-endian 1/4-byte
+ // unsigned integer.
+ // - b3b2/b1b0 specifies the integer type of id and offset. When the 2 bits are 0/1/2, the
+ // list contains 1/2/3-byte little-endian unsigned integers.
+ public static final int OBJECT = 2;
+ // Array value. The content contains a size, a list of field offsets, and the actual element
+ // data. It is similar to an object without the id list. The length of the offset list
+ // is `size + 1`, where the last offset represents the total size of the element data.
+ // Its type info should be: 000_b2_b1b0:
+ // - b2 specifies the type of size.
+ // - b1b0 specifies the integer type of offset.
+ public static final int ARRAY = 3;
+
+ // Below are all the possible type info values for `PRIMITIVE`.
+ // JSON Null value. Empty content.
+ public static final int NULL = 0;
+ // True value. Empty content.
+ public static final int TRUE = 1;
+ // False value. Empty content.
+ public static final int FALSE = 2;
+ // 1-byte little-endian signed integer.
+ public static final int INT1 = 3;
+ // 2-byte little-endian signed integer.
+ public static final int INT2 = 4;
+ // 4-byte little-endian signed integer.
+ public static final int INT4 = 5;
+ // 8-byte little-endian signed integer.
+ public static final int INT8 = 6;
+ // 8-byte IEEE double.
+ public static final int DOUBLE = 7;
+ // 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed integer.
+ public static final int DECIMAL4 = 8;
+ // 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed integer.
+ public static final int DECIMAL8 = 9;
+ // 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed integer.
+ public static final int DECIMAL16 = 10;
+ // Date value. Content is 4-byte little-endian signed integer that represents the number of days
+ // from the Unix epoch.
+ public static final int DATE = 11;
+ // Timestamp value. Content is 8-byte little-endian signed integer that represents the number of
+ // microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is displayed to users in
+ // their local time zones and may be displayed differently depending on the execution environment.
+ public static final int TIMESTAMP = 12;
+ // Timestamp_ntz value. It has the same content as `TIMESTAMP` but should always be interpreted
+ // as if the local time zone is UTC.
+ public static final int TIMESTAMP_NTZ = 13;
+ // 4-byte IEEE float.
+ public static final int FLOAT = 14;
+ // Binary value. The content is (4-byte little-endian unsigned integer representing the binary
+ // size) + (size bytes of binary content).
+ public static final int BINARY = 15;
+ // Long string value. The content is (4-byte little-endian unsigned integer representing the
+ // string size) + (size bytes of string content).
+ public static final int LONG_STR = 16;
+
+ public static final byte VERSION = 1;
+ // The lower 4 bits of the first metadata byte contain the version.
+ public static final byte VERSION_MASK = 0x0F;
+
+ public static final int U24_MAX = 0xFFFFFF;
+ public static final int U32_SIZE = 4;
+
+ // Both variant value and variant metadata need to be no longer than 16MiB.
+ public static final int SIZE_LIMIT = U24_MAX + 1;
+
+ public static final int MAX_DECIMAL4_PRECISION = 9;
+ public static final int MAX_DECIMAL8_PRECISION = 18;
+ public static final int MAX_DECIMAL16_PRECISION = 38;
+
+ private VariantUtil() {}
+
+ // Check the validity of an array index `position`. Throw `IllegalArgumentException` if it is out of
+ // bound, meaning that the variant is malformed.
+ static void checkIndex(int position, int length)
+ {
+ if (position < 0 || position >= length) {
+ throw new IllegalArgumentException("Index out of bound: %s (length: %s)".formatted(position, length));
+ }
+ }
+
+ // Read a little-endian signed long value from `bytes[position, position + numBytes)`.
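+ // For example, with numBytes = 2, bytes {0xFF, 0xFF} decode to -1 and {0x01, 0x02} to 513.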
+ static long readLong(byte[] bytes, int position, int numBytes)
+ {
+ checkIndex(position, bytes.length);
+ checkIndex(position + numBytes - 1, bytes.length);
+ long result = 0;
+ // All bytes except the most significant byte should be unsign-extended and shifted (so we need
+ // `& 0xFF`). The most significant byte should be sign-extended and is handled after the loop.
+ for (int i = 0; i < numBytes - 1; ++i) {
+ long unsignedByteValue = bytes[position + i] & 0xFF;
+ result |= unsignedByteValue << (8 * i);
+ }
+ long signedByteValue = bytes[position + numBytes - 1];
+ result |= signedByteValue << (8 * (numBytes - 1));
+ return result;
+ }
+
+ // Read a little-endian unsigned int value from `bytes[position, position + numBytes)`. The value must fit
+ // into a non-negative int (`[0, Integer.MAX_VALUE]`).
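+ // For example, with numBytes = 3, bytes {0x00, 0x00, 0x01} decode to 0x010000 = 65536.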
+ static int readUnsigned(byte[] bytes, int position, int numBytes)
+ {
+ checkIndex(position, bytes.length);
+ checkIndex(position + numBytes - 1, bytes.length);
+ int result = 0;
+ // Similar to the `readLong` loop, but all bytes should be unsign-extended.
+ for (int i = 0; i < numBytes; ++i) {
+ int unsignedByteValue = bytes[position + i] & 0xFF;
+ result |= unsignedByteValue << (8 * i);
+ }
+ if (result < 0) {
+ throw new IllegalArgumentException("Value out of bound: %s".formatted(result));
+ }
+ return result;
+ }
+
+ // The value type of a variant value. It is determined by the header byte, but the mapping is not
+ // 1:1 (for example, INT1/2/4/8 all map to `Type.LONG`).
+ public enum Type
+ {
+ NULL,
+ BOOLEAN,
+ LONG,
+ FLOAT,
+ DOUBLE,
+ DECIMAL,
+ STRING,
+ BINARY,
+ DATE,
+ TIMESTAMP,
+ TIMESTAMP_NTZ,
+ ARRAY,
+ OBJECT,
+ }
+
+ // Get the value type of variant value `value[position...]`. It is only legal to call `get*` if
+ // `getType` returns this type (for example, it is only legal to call `getLong` if `getType`
+ // returns `Type.LONG`).
+ // Throw `IllegalArgumentException` if the variant is malformed.
+ public static Type getType(byte[] value, int position)
+ {
+ checkIndex(position, value.length);
+ int basicType = value[position] & BASIC_TYPE_MASK;
+ int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ return switch (basicType) {
+ case SHORT_STR -> Type.STRING;
+ case OBJECT -> Type.OBJECT;
+ case ARRAY -> Type.ARRAY;
+ default -> switch (typeInfo) {
+ case NULL -> Type.NULL;
+ case TRUE, FALSE -> Type.BOOLEAN;
+ case INT1, INT2, INT4, INT8 -> Type.LONG;
+ case DOUBLE -> Type.DOUBLE;
+ case DECIMAL4, DECIMAL8, DECIMAL16 -> Type.DECIMAL;
+ case DATE -> Type.DATE;
+ case TIMESTAMP -> Type.TIMESTAMP;
+ case TIMESTAMP_NTZ -> Type.TIMESTAMP_NTZ;
+ case FLOAT -> Type.FLOAT;
+ case BINARY -> Type.BINARY;
+ case LONG_STR -> Type.STRING;
+ default -> throw new IllegalArgumentException("Unexpected type: " + typeInfo);
+ };
+ };
+ }
+
+ private static IllegalStateException unexpectedType(Type type)
+ {
+ return new IllegalStateException("Expect type to be " + type);
+ }
+
+ // Get a boolean value from variant value `value[position...]`.
+ // Throw `IllegalArgumentException` if the variant is malformed.
+ public static boolean getBoolean(byte[] value, int position)
+ {
+ checkIndex(position, value.length);
+ int basicType = value[position] & BASIC_TYPE_MASK;
+ int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ if (basicType != PRIMITIVE || (typeInfo != TRUE && typeInfo != FALSE)) {
+ throw unexpectedType(Type.BOOLEAN);
+ }
+ return typeInfo == TRUE;
+ }
+
+ // Get a long value from variant value `value[position...]`.
+ // It is only legal to call it if `getType` returns one of `Type.LONG/DATE/TIMESTAMP/
+ // TIMESTAMP_NTZ`. If the type is `DATE`, the return value is guaranteed to fit into an int and
+ // represents the number of days from the Unix epoch. If the type is `TIMESTAMP/TIMESTAMP_NTZ`,
+ // the return value represents the number of microseconds from the Unix epoch.
+ // Throw `IllegalArgumentException` if the variant is malformed.
+ public static long getLong(byte[] value, int position)
+ {
+ checkIndex(position, value.length);
+ int basicType = value[position] & BASIC_TYPE_MASK;
+ int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ String exceptionMessage = "Expect type to be LONG/DATE/TIMESTAMP/TIMESTAMP_NTZ";
+ if (basicType != PRIMITIVE) {
+ throw new IllegalStateException(exceptionMessage);
+ }
+ return switch (typeInfo) {
+ case INT1 -> readLong(value, position + 1, 1);
+ case INT2 -> readLong(value, position + 1, 2);
+ case INT4, DATE -> readLong(value, position + 1, 4);
+ case INT8, TIMESTAMP, TIMESTAMP_NTZ -> readLong(value, position + 1, 8);
+ default -> throw new IllegalStateException(exceptionMessage);
+ };
+ }
+
+ // Get a double value from variant value `value[position...]`.
+ // Throw `IllegalArgumentException` if the variant is malformed.
+ public static double getDouble(byte[] value, int position)
+ {
+ checkIndex(position, value.length);
+ int basicType = value[position] & BASIC_TYPE_MASK;
+ int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ if (basicType != PRIMITIVE || typeInfo != DOUBLE) {
+ throw unexpectedType(Type.DOUBLE);
+ }
+ return Double.longBitsToDouble(readLong(value, position + 1, 8));
+ }
+
+ // Check whether the precision and scale of the decimal are within the limit.
+ private static void checkDecimal(BigDecimal decimal, int maxPrecision)
+ {
+ if (decimal.precision() > maxPrecision || decimal.scale() > maxPrecision) {
+ throw new IllegalArgumentException("Decimal out of bound: " + decimal);
+ }
+ }
+
+ // Get a decimal value from variant value `value[position...]`.
+ // Throw `IllegalArgumentException` if the variant is malformed.
+ public static BigDecimal getDecimal(byte[] value, int position)
+ {
+ checkIndex(position, value.length);
+ int basicType = value[position] & BASIC_TYPE_MASK;
+ int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ if (basicType != PRIMITIVE) {
+ throw unexpectedType(Type.DECIMAL);
+ }
+ // Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be
+ // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`.
+ int scale = value[position + 1] & 0xFF;
+ BigDecimal result;
+ switch (typeInfo) {
+ case DECIMAL4:
+ result = BigDecimal.valueOf(readLong(value, position + 2, 4), scale);
+ checkDecimal(result, MAX_DECIMAL4_PRECISION);
+ break;
+ case DECIMAL8:
+ result = BigDecimal.valueOf(readLong(value, position + 2, 8), scale);
+ checkDecimal(result, MAX_DECIMAL8_PRECISION);
+ break;
+ case DECIMAL16:
+ checkIndex(position + 17, value.length);
+ byte[] bytes = new byte[16];
+ // Copy the bytes in reverse order because the `BigInteger` constructor expects a big-endian
+ // representation.
+ for (int i = 0; i < 16; ++i) {
+ bytes[i] = value[position + 17 - i];
+ }
+ result = new BigDecimal(new BigInteger(bytes), scale);
+ checkDecimal(result, MAX_DECIMAL16_PRECISION);
+ break;
+ default:
+ throw unexpectedType(Type.DECIMAL);
+ }
+ return result.stripTrailingZeros();
+ }
+
+ // Get a float value from variant value `value[position...]`.
+ // Throw `IllegalArgumentException` if the variant is malformed.
+ public static float getFloat(byte[] value, int position)
+ {
+ checkIndex(position, value.length);
+ int basicType = value[position] & BASIC_TYPE_MASK;
+ int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ if (basicType != PRIMITIVE || typeInfo != FLOAT) {
+ throw unexpectedType(Type.FLOAT);
+ }
+ return Float.intBitsToFloat((int) readLong(value, position + 1, 4));
+ }
+
+ // Get a binary value from variant value `value[position...]`.
+ // Throw `IllegalArgumentException` if the variant is malformed.
+ public static byte[] getBinary(byte[] value, int position)
+ {
+ checkIndex(position, value.length);
+ int basicType = value[position] & BASIC_TYPE_MASK;
+ int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ if (basicType != PRIMITIVE || typeInfo != BINARY) {
+ throw unexpectedType(Type.BINARY);
+ }
+ int start = position + 1 + U32_SIZE;
+ int length = readUnsigned(value, position + 1, U32_SIZE);
+ checkIndex(start + length - 1, value.length);
+ return Arrays.copyOfRange(value, start, start + length);
+ }
+
+ // Get a string value from variant value `value[position...]`.
+ // Throw `IllegalArgumentException` if the variant is malformed.
+ public static String getString(byte[] value, int position)
+ {
+ checkIndex(position, value.length);
+ int basicType = value[position] & BASIC_TYPE_MASK;
+ int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ if (basicType == SHORT_STR || (basicType == PRIMITIVE && typeInfo == LONG_STR)) {
+ int start;
+ int length;
+ if (basicType == SHORT_STR) {
+ start = position + 1;
+ length = typeInfo;
+ }
+ else {
+ start = position + 1 + U32_SIZE;
+ length = readUnsigned(value, position + 1, U32_SIZE);
+ }
+ checkIndex(start + length - 1, value.length);
+ return new String(value, start, length, UTF_8);
+ }
+ throw unexpectedType(Type.STRING);
+ }
+
+ public interface ObjectHandler<T>
+ {
+ /**
+ * @param size Number of object fields.
+ * @param idSize The integer size of the field id list.
+ * @param offsetSize The integer size of the offset list.
+ * @param idStart The starting index of the field id list in the variant value array.
+ * @param offsetStart The starting index of the offset list in the variant value array.
+ * @param dataStart The starting index of field data in the variant value array.
+ */
+ T apply(int size, int idSize, int offsetSize, int idStart, int offsetStart, int dataStart);
+ }
+
+ // A helper function to access a variant object. It provides `handler` with its required
+ // parameters and returns the handler's result.
+ public static <T> T handleObject(byte[] value, int position, ObjectHandler<T> handler)
+ {
+ checkIndex(position, value.length);
+ int basicType = value[position] & BASIC_TYPE_MASK;
+ int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ if (basicType != OBJECT) {
+ throw unexpectedType(Type.OBJECT);
+ }
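+ // Worked example (from TestVariant): value {2, 1, 0, 0, 2, 12, 1} is an object with 1-byte
+ // size/ids/offsets: size = 1, ids = [0], offsets = [0, 2], and field data {12, 1}, an INT1
+ // primitive holding 1, so the object maps metadata key 0 to the value 1.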
+ // Refer to the comment of the `OBJECT` constant for the details of the object header encoding.
+ // Suppose `typeInfo` has a bit representation of 0_b4_b3b2_b1b0, the following line extracts
+ // b4 to determine whether the object uses a 1/4-byte size.
+ boolean largeSize = ((typeInfo >> 4) & 0x1) != 0;
+ int sizeBytes = (largeSize ? U32_SIZE : 1);
+ int size = readUnsigned(value, position + 1, sizeBytes);
+ // Extracts b3b2 to determine the integer size of the field id list.
+ int idSize = ((typeInfo >> 2) & 0x3) + 1;
+ // Extracts b1b0 to determine the integer size of the offset list.
+ int offsetSize = (typeInfo & 0x3) + 1;
+ int idStart = position + 1 + sizeBytes;
+ int offsetStart = idStart + size * idSize;
+ int dataStart = offsetStart + (size + 1) * offsetSize;
+ return handler.apply(size, idSize, offsetSize, idStart, offsetStart, dataStart);
+ }
+
+ public interface ArrayHandler<T>
+ {
+ /**
+ * @param size Number of array elements.
+ * @param offsetSize The integer size of the offset list.
+ * @param offsetStart The starting index of the offset list in the variant value array.
+ * @param dataStart The starting index of element data in the variant value array.
+ */
+ T apply(int size, int offsetSize, int offsetStart, int dataStart);
+ }
+
+ // A helper function to access a variant array.
+ public static <T> T handleArray(byte[] value, int position, ArrayHandler<T> handler)
+ {
+ checkIndex(position, value.length);
+ int basicType = value[position] & BASIC_TYPE_MASK;
+ int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ if (basicType != ARRAY) {
+ throw unexpectedType(Type.ARRAY);
+ }
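+ // Worked example (from TestVariant): value {3, 1, 0, 2, 12, 1} is an array with 1-byte
+ // size/offsets: size = 1, offsets = [0, 2], and element data {12, 1}, an INT1 primitive
+ // holding 1, i.e. the JSON array [1].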
+ // Refer to the comment of the `ARRAY` constant for the details of the array header encoding.
+ // Suppose `typeInfo` has a bit representation of 000_b2_b1b0, the following line extracts
+ // b2 to determine whether the array uses a 1/4-byte size.
+ boolean largeSize = ((typeInfo >> 2) & 0x1) != 0;
+ int sizeBytes = (largeSize ? U32_SIZE : 1);
+ int size = readUnsigned(value, position + 1, sizeBytes);
+ // Extracts b1b0 to determine the integer size of the offset list.
+ int offsetSize = (typeInfo & 0x3) + 1;
+ int offsetStart = position + 1 + sizeBytes;
+ int dataStart = offsetStart + (size + 1) * offsetSize;
+ return handler.apply(size, offsetSize, offsetStart, dataStart);
+ }
+
+ // Get a key at `id` in the variant metadata.
+ // Throw `IllegalArgumentException` if the variant is malformed. An out-of-bound `id` is also considered
+ // a malformed variant because it is read from the corresponding variant value.
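+ // For example, getMetadataKey(new byte[] {1, 1, 0, 3, 'c', 'o', 'l'}, 0) reads a 1-byte offset
+ // size from the header, dictSize = 1, offsets [0, 3], and returns "col".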
+ public static String getMetadataKey(byte[] metadata, int id)
+ {
+ checkIndex(0, metadata.length);
+ // Extracts the highest 2 bits in the metadata header to determine the integer size of the
+ // offset list.
+ int offsetSize = ((metadata[0] >> 6) & 0x3) + 1;
+ int dictSize = readUnsigned(metadata, 1, offsetSize);
+ if (id >= dictSize) {
+ throw new IllegalArgumentException("Index out of bound: %s (size: %s)".formatted(id, dictSize));
+ }
+ // Before the string data, there are a header byte, a dictionary size of `offsetSize` bytes, and
+ // `(dictSize + 1)` offsets of `offsetSize` bytes each.
+ int stringStart = 1 + (dictSize + 2) * offsetSize;
+ int offset = readUnsigned(metadata, 1 + (id + 1) * offsetSize, offsetSize);
+ int nextOffset = readUnsigned(metadata, 1 + (id + 2) * offsetSize, offsetSize);
+ if (offset > nextOffset) {
+ throw new IllegalArgumentException("Invalid offset: %s > %s".formatted(offset, nextOffset));
+ }
+ checkIndex(stringStart + nextOffset - 1, metadata.length);
+ return new String(metadata, stringStart + offset, nextOffset - offset, UTF_8);
+ }
+}
diff --git a/lib/trino-parquet/src/test/java/io/trino/parquet/spark/TestVariant.java b/lib/trino-parquet/src/test/java/io/trino/parquet/spark/TestVariant.java
new file mode 100644
index 000000000000..fb963f39fef2
--- /dev/null
+++ b/lib/trino-parquet/src/test/java/io/trino/parquet/spark/TestVariant.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.parquet.spark;
+
+import org.intellij.lang.annotations.Language;
+import org.junit.jupiter.api.Test;
+
+import java.time.ZoneOffset;
+
+import static java.time.ZoneOffset.UTC;
+import static org.assertj.core.api.Assertions.assertThat;
+
+final class TestVariant
+{
+ @Test
+ void testVariantToJson()
+ {
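+ // The byte arrays are Spark variant encodings: e.g. metadata {1, 1, 0, 3, 99, 111, 108} is
+ // version 1 with the single-key dictionary ["col"] (bytes 99, 111, 108 spell "col"), and
+ // value {2, 1, 0, 0, 2, 12, 1} is an object mapping that key to the INT1 value 1.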
+ assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 12, 1}, new byte[] {1, 1, 0, 3, 99, 111, 108}, "{\"col\":1}");
+ assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 12, 2}, new byte[] {1, 1, 0, 5, 97, 114, 114, 97, 121}, "{\"array\":2}");
+ assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 12, 3}, new byte[] {1, 1, 0, 3, 109, 97, 112}, "{\"map\":3}");
+ assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 12, 4}, new byte[] {1, 1, 0, 6, 115, 116, 114, 117, 99, 116}, "{\"struct\":4}");
+
+ assertVariantToJson(new byte[] {12, 1}, new byte[] {1, 0, 0}, "1");
+ assertVariantToJson(new byte[] {12, 2}, new byte[] {1, 0, 0}, "2");
+ assertVariantToJson(new byte[] {12, 3}, new byte[] {1, 0, 0}, "3");
+ assertVariantToJson(new byte[] {12, 4}, new byte[] {1, 0, 0}, "4");
+ }
+
+ @Test
+ void testVariantToJsonAllTypes()
+ {
+ assertVariantToJson(new byte[] {4}, new byte[] {1, 0, 0}, "true");
+ assertVariantToJson(new byte[] {12, 1}, new byte[] {1, 0, 0}, "1");
+ assertVariantToJson(new byte[] {56, -51, -52, 76, 62}, new byte[] {1, 0, 0}, "0.2");
+ assertVariantToJson(new byte[] {28, 51, 51, 51, 51, 51, 51, -45, 63}, new byte[] {1, 0, 0}, "0.3");
+ assertVariantToJson(new byte[] {32, 1, 4, 0, 0, 0}, new byte[] {1, 0, 0}, "0.4");
+ assertVariantToJson(new byte[] {37, 116, 101, 115, 116, 32, 100, 97, 116, 97}, new byte[] {1, 0, 0}, "\"test data\"");
+ assertVariantToJson(new byte[] {60, 3, 0, 0, 0, 101, 104, 63}, new byte[] {1, 0, 0}, "\"ZWg/\"");
+ assertVariantToJson(new byte[] {44, -27, 72, 0, 0}, new byte[] {1, 0, 0}, "\"2021-02-03\"");
+ assertVariantToJson(new byte[] {48, -88, -34, 22, -20, 19, -116, 3, 0}, new byte[] {1, 0, 0}, "\"2001-08-22 01:02:03.321+00:00\"");
+ assertVariantToJson(new byte[] {52, 64, -34, -104, 21, -22, -73, 5, 0}, new byte[] {1, 0, 0}, "\"2021-01-02 12:34:56.123456\"");
+ assertVariantToJson(new byte[] {3, 1, 0, 2, 12, 1}, new byte[] {1, 0, 0}, "[1]");
+ assertVariantToJson(new byte[] {2, 2, 0, 1, 0, 2, 4, 12, 1, 12, 2}, new byte[] {1, 2, 0, 4, 8, 107, 101, 121, 49, 107, 101, 121, 50}, "{\"key1\":1,\"key2\":2}");
+ assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 12, 1}, new byte[] {1, 1, 0, 1, 120}, "{\"x\":1}");
+ }
+
+ @Test
+ void testVariantToJsonTimestamp()
+ {
+ assertThat(new Variant(new byte[] {48, -88, -34, 22, -20, 19, -116, 3, 0}, new byte[] {1, 0, 0}).toJson(ZoneOffset.ofHours(-18)))
+ .isEqualTo("\"2001-08-21 07:02:03.321-18:00\"");
+ assertThat(new Variant(new byte[] {48, -88, -34, 22, -20, 19, -116, 3, 0}, new byte[] {1, 0, 0}).toJson(ZoneOffset.ofHours(-1)))
+ .isEqualTo("\"2001-08-22 00:02:03.321-01:00\"");
+ assertThat(new Variant(new byte[] {48, -88, -34, 22, -20, 19, -116, 3, 0}, new byte[] {1, 0, 0}).toJson(UTC))
+ .isEqualTo("\"2001-08-22 01:02:03.321+00:00\"");
+ assertThat(new Variant(new byte[] {48, -88, -34, 22, -20, 19, -116, 3, 0}, new byte[] {1, 0, 0}).toJson(ZoneOffset.ofHours(1)))
+ .isEqualTo("\"2001-08-22 02:02:03.321+01:00\"");
+ assertThat(new Variant(new byte[] {48, -88, -34, 22, -20, 19, -116, 3, 0}, new byte[] {1, 0, 0}).toJson(ZoneOffset.ofHours(18)))
+ .isEqualTo("\"2001-08-22 19:02:03.321+18:00\"");
+ }
+
+ @Test
+ void testVariantToJsonNullValue()
+ {
+ assertVariantToJson(new byte[] {2, 1, 0, 0, 1, 0}, new byte[] {1, 1, 0, 3, 99, 111, 108}, "{\"col\":null}");
+ assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 0}, new byte[] {1, 1, 0, 5, 97, 114, 114, 97, 121}, "{\"array\":null}");
+ assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 0}, new byte[] {1, 1, 0, 3, 109, 97, 112}, "{\"map\":null}");
+ assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 0}, new byte[] {1, 1, 0, 6, 115, 116, 114, 117, 99, 116}, "{\"struct\":null}");
+ }
+
+ private static void assertVariantToJson(byte[] value, byte[] metadata, @Language("JSON") String expectedJson)
+ {
+ assertThat(new Variant(value, metadata).toJson(UTC))
+ .isEqualTo(expectedJson);
+ }
+}
diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaHiveTypeTranslator.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaHiveTypeTranslator.java
index 8b31d8a6bf1e..82512f5c9119 100644
--- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaHiveTypeTranslator.java
+++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaHiveTypeTranslator.java
@@ -29,6 +29,7 @@
import io.trino.spi.type.TypeSignatureParameter;
import io.trino.spi.type.VarcharType;
+import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Verify.verify;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.metastore.HiveType.HIVE_BINARY;
@@ -42,6 +43,7 @@
import static io.trino.metastore.HiveType.HIVE_SHORT;
import static io.trino.metastore.HiveType.HIVE_STRING;
import static io.trino.metastore.HiveType.HIVE_TIMESTAMP;
+import static io.trino.metastore.HiveType.HIVE_VARIANT;
import static io.trino.metastore.type.CharTypeInfo.MAX_CHAR_LENGTH;
import static io.trino.metastore.type.TypeInfoFactory.getCharTypeInfo;
import static io.trino.metastore.type.TypeInfoFactory.getListTypeInfo;
@@ -57,6 +59,7 @@
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.RealType.REAL;
import static io.trino.spi.type.SmallintType.SMALLINT;
+import static io.trino.spi.type.StandardTypes.JSON;
import static io.trino.spi.type.TinyintType.TINYINT;
import static io.trino.spi.type.VarbinaryType.VARBINARY;
import static java.lang.String.format;
@@ -129,6 +132,10 @@ public static TypeInfo translate(Type type)
if (type instanceof DecimalType decimalType) {
return new DecimalTypeInfo(decimalType.getPrecision(), decimalType.getScale());
}
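+ // Variant columns surface as the JSON type in Trino (see DeltaLakeSchemaSupport), so translate
+ // JSON back to the Hive variant type info.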
+ if (type.getTypeSignature().getBase().equals(JSON)) {
+ checkArgument(type.getTypeSignature().getParameters().isEmpty(), "JSON type should not have parameters");
+ return HIVE_VARIANT.getTypeInfo();
+ }
if (type instanceof ArrayType arrayType) {
TypeInfo elementType = translate(arrayType.getElementType());
return getListTypeInfo(elementType);
diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeSchemaSupport.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeSchemaSupport.java
index d9fc72df070a..39b6e37c2e2a 100644
--- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeSchemaSupport.java
+++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeSchemaSupport.java
@@ -86,6 +86,7 @@
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.RealType.REAL;
import static io.trino.spi.type.SmallintType.SMALLINT;
+import static io.trino.spi.type.StandardTypes.JSON;
import static io.trino.spi.type.TimestampType.TIMESTAMP_MICROS;
import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS;
import static io.trino.spi.type.TinyintType.TINYINT;
@@ -765,7 +766,7 @@ private static Type buildType(TypeManager typeManager, JsonNode typeNode, boolea
// For more info, see https://delta-users.slack.com/archives/GKTUWT03T/p1585760533005400
// and https://cwiki.apache.org/confluence/display/Hive/Different+TIMESTAMP+types
case "timestamp" -> TIMESTAMP_TZ_MILLIS;
- case "variant" -> throw new UnsupportedTypeException("variant");
+ case "variant" -> typeManager.getType(new TypeSignature(JSON));
default -> throw new TypeNotFoundException(new TypeSignature(primitiveType));
};
}
diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java
index 0f0b29f84340..8c3da6fcc062 100644
--- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java
+++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java
@@ -133,6 +133,7 @@ public class TestDeltaLakeBasic
new ResourceTable("unsupported_writer_feature", "deltalake/unsupported_writer_feature"),
new ResourceTable("unsupported_writer_version", "deltalake/unsupported_writer_version"),
new ResourceTable("variant", "databricks153/variant"),
+ new ResourceTable("variant_types", "databricks153/variant_types"),
new ResourceTable("type_widening", "databricks153/type_widening"),
new ResourceTable("type_widening_partition", "databricks153/type_widening_partition"),
new ResourceTable("type_widening_nested", "databricks153/type_widening_nested"));
@@ -1504,14 +1505,65 @@ public void testUniFormIcebergV2()
@Test
public void testVariant()
{
- // TODO (https://github.com/trinodb/trino/issues/22309) Add support for variant type
assertThat(query("DESCRIBE variant")).result().projected("Column", "Type")
.skippingTypesCheck()
- .matches("VALUES ('col_int', 'integer'), ('col_string', 'varchar')");
+ .matches("VALUES " +
+ "('col_int', 'integer')," +
+ "('simple_variant', 'json')," +
+ "('array_variant', 'array(json)')," +
+ "('map_variant', 'map(varchar, json)')," +
+ "('struct_variant', 'row(x json)')," +
+ "('col_string', 'varchar')");
+
+ assertThat(query("SELECT col_int, simple_variant, array_variant[1], map_variant['key1'], struct_variant.x, col_string FROM variant"))
+ .skippingTypesCheck()
+ .matches("VALUES " +
+ "(1, JSON '{\"col\":1}', JSON '{\"array\":2}', JSON '{\"map\":3}', JSON '{\"struct\":4}', 'test data')," +
+ "(2, JSON '{\"col\":null}', JSON '{\"array\":null}', JSON '{\"map\":null}', JSON '{\"struct\":null}', 'test null data')," +
+ "(3, NULL, NULL, NULL, NULL, 'test null')," +
+ "(4, JSON '1', JSON '2', JSON '3', JSON '4', 'test without fields')");
- assertQuery("SELECT * FROM variant", "VALUES (1, 'test data')");
+ assertQueryFails("INSERT INTO variant VALUES (2, null, null, null, null, 'new data')", "Unsupported writer features: .*");
+ }
- assertQueryFails("INSERT INTO variant VALUES (2, 'new data')", "Unsupported writer features: .*");
+ /**
+ * @see databricks153.variant_types
+ */
+ @Test
+ public void testVariantTypes()
+ {
+ assertThat(query("""
+ SELECT
+ col_boolean,
+ col_long,
+ col_float,
+ col_double,
+ col_decimal,
+ col_string,
+ col_binary,
+ col_date,
+ col_timestamp,
+ col_timestampntz,
+ col_array,
+ col_map,
+ col_struct
+ FROM variant_types"""))
+ .skippingTypesCheck()
+ .matches("""
+ VALUES
+ ('true',
+ '1',
+ '0.2',
+ '0.3',
+ '0.4',
+ '"test data"',
+ '"ZWg/"',
+ '"2021-02-03"',
+ '"2001-08-21 19:02:03.321-06:00"',
+ '"2021-01-02 12:34:56.123456"',
+ '[1]',
+ '{"key1":1,"key2":2}',
+ '{"x":1}')""");
}
@Test
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/README.md b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/README.md
index 65935f0c8ac3..62015da9be4a 100644
--- a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/README.md
+++ b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/README.md
@@ -8,4 +8,13 @@ LOCATION ?;
INSERT INTO default.test_variant
VALUES (1, parse_json('{"col":1}'), array(parse_json('{"array":2}')), map('key1', parse_json('{"map":3}')), named_struct('x', parse_json('{"struct":4}')), 'test data');
+
+INSERT INTO default.test_variant
+VALUES (2, parse_json('{"col":null}'), array(parse_json('{"array":null}')), map('key1', parse_json('{"map":null}')), named_struct('x', parse_json('{"struct":null}')), 'test null data');
+
+INSERT INTO default.test_variant
+VALUES (3, parse_json(NULL), array(NULL), map('key1', NULL), named_struct('x', NULL), 'test null');
+
+INSERT INTO default.test_variant
+VALUES (4, parse_json('1'), array(parse_json('2')), map('key1', parse_json('3')), named_struct('x', parse_json('4')), 'test without fields');
```
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000002.json b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000002.json
new file mode 100644
index 000000000000..2bd3f1f5801a
--- /dev/null
+++ b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000002.json
@@ -0,0 +1,2 @@
+{"commitInfo":{"timestamp":1718586112107,"userId":"7853186923043731","userName":"yuya.ebihara@starburstdata.com","operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0607-024930-gxd23c26","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"3377"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/15.3.x-scala2.12","txnId":"58f6af45-8063-4fae-9018-95bfe6cbd561"}}
+{"add":{"path":"part-00000-090dbdcd-fc4e-47c9-8549-59cf79d9c7e6-c000.snappy.parquet","partitionValues":{},"size":3377,"modificationTime":1718586111000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"col_int\":2,\"col_string\":\"test null data\"},\"maxValues\":{\"col_int\":2,\"col_string\":\"test null data\"},\"nullCount\":{\"col_int\":0,\"simple_variant\":0,\"array_variant\":0,\"map_variant\":0,\"struct_variant\":{\"x\":0},\"col_string\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1718586111000000","MIN_INSERTION_TIME":"1718586111000000","MAX_INSERTION_TIME":"1718586111000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000003.json b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000003.json
new file mode 100644
index 000000000000..66839aa9fb0f
--- /dev/null
+++ b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000003.json
@@ -0,0 +1,2 @@
+{"commitInfo":{"timestamp":1718586451540,"userId":"7853186923043731","userName":"yuya.ebihara@starburstdata.com","operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0607-024930-gxd23c26","readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"2849"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/15.3.x-scala2.12","txnId":"43f39922-3699-49fe-9b32-a1556bbef0a0"}}
+{"add":{"path":"part-00000-4d01f20d-41d2-49fb-b98e-251a4c0984aa-c000.snappy.parquet","partitionValues":{},"size":2849,"modificationTime":1718586452000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"col_int\":3,\"col_string\":\"test null\"},\"maxValues\":{\"col_int\":3,\"col_string\":\"test null\"},\"nullCount\":{\"col_int\":0,\"simple_variant\":1,\"array_variant\":0,\"map_variant\":0,\"struct_variant\":{\"x\":1},\"col_string\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1718586452000000","MIN_INSERTION_TIME":"1718586452000000","MAX_INSERTION_TIME":"1718586452000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000004.json b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000004.json
new file mode 100644
index 000000000000..7d8d016f9b08
--- /dev/null
+++ b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000004.json
@@ -0,0 +1,2 @@
+{"commitInfo":{"timestamp":1718587514563,"userId":"7853186923043731","userName":"yuya.ebihara@starburstdata.com","operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0607-024930-gxd23c26","readVersion":2,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"3156"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/15.3.x-scala2.12","txnId":"75d31328-c821-4273-af1b-4d7f5d15cdd8"}}
+{"add":{"path":"part-00000-81478783-cb9b-4719-975e-027177ae86e3-c000.snappy.parquet","partitionValues":{},"size":3156,"modificationTime":1718587515000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"col_int\":4,\"col_string\":\"test without fields\"},\"maxValues\":{\"col_int\":4,\"col_string\":\"test without fields\"},\"nullCount\":{\"col_int\":0,\"simple_variant\":0,\"array_variant\":0,\"map_variant\":0,\"struct_variant\":{\"x\":0},\"col_string\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1718587515000000","MIN_INSERTION_TIME":"1718587515000000","MAX_INSERTION_TIME":"1718587515000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-090dbdcd-fc4e-47c9-8549-59cf79d9c7e6-c000.snappy.parquet b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-090dbdcd-fc4e-47c9-8549-59cf79d9c7e6-c000.snappy.parquet
new file mode 100644
index 000000000000..1917a2760127
Binary files /dev/null and b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-090dbdcd-fc4e-47c9-8549-59cf79d9c7e6-c000.snappy.parquet differ
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-4d01f20d-41d2-49fb-b98e-251a4c0984aa-c000.snappy.parquet b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-4d01f20d-41d2-49fb-b98e-251a4c0984aa-c000.snappy.parquet
new file mode 100644
index 000000000000..6c58889f533e
Binary files /dev/null and b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-4d01f20d-41d2-49fb-b98e-251a4c0984aa-c000.snappy.parquet differ
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-81478783-cb9b-4719-975e-027177ae86e3-c000.snappy.parquet b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-81478783-cb9b-4719-975e-027177ae86e3-c000.snappy.parquet
new file mode 100644
index 000000000000..cd4a81cc6c7a
Binary files /dev/null and b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-81478783-cb9b-4719-975e-027177ae86e3-c000.snappy.parquet differ
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/README.md b/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/README.md
new file mode 100644
index 000000000000..c18728cf598a
--- /dev/null
+++ b/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/README.md
@@ -0,0 +1,21 @@
+Data generated using Databricks 15.3:
+
+```sql
+CREATE TABLE default.test_variant_types
+USING delta
+LOCATION ?
+AS SELECT
+ CAST(true AS variant) AS col_boolean,
+ CAST(1 AS variant) AS col_long,
+ CAST(CAST('0.2' AS float) AS variant) AS col_float,
+ CAST(CAST('0.3' AS double) AS variant) AS col_double,
+ CAST(CAST('0.4' AS decimal) AS variant) AS col_decimal,
+ CAST('test data' AS variant) AS col_string,
+ CAST(X'65683F' AS variant) AS col_binary,
+ CAST(date '2021-02-03' AS variant) AS col_date,
+ CAST(timestamp '2001-08-22 01:02:03.321 UTC' AS variant) AS col_timestamp,
+ CAST(timestamp_ntz '2021-01-02 12:34:56.123456' AS variant) AS col_timestampntz,
+ CAST(array(1) AS variant) AS col_array,
+ CAST(map('key1', 1, 'key2', 2) AS variant) AS col_map,
+ CAST(named_struct('x', 1) AS variant) AS col_struct;
+```
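
Two of the expected values in the new `testVariantTypes` assertion trace back to this CTAS in a non-obvious way: `col_binary` reads back as the string `"ZWg/"` because the variant's JSON rendering base64-encodes binary values, and `col_timestamp` reads back as `2001-08-21 19:02:03.321-06:00` because the UTC instant `2001-08-22 01:02:03.321` is reformatted in what appears to be a UTC-6 session zone. The base64 step, as a quick standalone check:

```java
import java.util.Base64;

// Why the expected value for col_binary is "ZWg/": base64 of the three bytes
// X'65683F' ("eh?" in ASCII). Standalone check, not part of the patch.
public class VariantBase64Check
{
    public static void main(String[] args)
    {
        byte[] bytes = {0x65, 0x68, 0x3F};
        System.out.println(Base64.getEncoder().encodeToString(bytes)); // prints ZWg/
    }
}
```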
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/_delta_log/00000000000000000000.json b/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/_delta_log/00000000000000000000.json
new file mode 100644
index 000000000000..31a0a76acbf8
--- /dev/null
+++ b/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/_delta_log/00000000000000000000.json
@@ -0,0 +1,4 @@
+{"commitInfo":{"timestamp":1718597262586,"userId":"7853186923043731","userName":"yuya.ebihara@starburstdata.com","operation":"CREATE TABLE AS SELECT","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{\"delta.enableDeletionVectors\":\"true\"}","statsOnLoad":false},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0607-024930-gxd23c26","isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"6331"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/15.3.x-scala2.12","txnId":"a5f86d81-443e-4374-a377-699ca48ae6da"}}
+{"metaData":{"id":"ea47d678-864d-4652-8957-3099f5cf3153","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"col_boolean\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_long\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_float\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_double\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_decimal\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_string\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_binary\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_date\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_timestamp\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_timestampntz\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_array\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_map\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_struct\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true"},"createdTime":1718597261906}}
+{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","variantType-preview"],"writerFeatures":["deletionVectors","variantType-preview"]}}
+{"add":{"path":"part-00000-376865ac-f6c8-4f0f-9d9b-bb8a982e6c0b-c000.snappy.parquet","partitionValues":{},"size":6331,"modificationTime":1718597263000,"dataChange":true,"stats":"{\"numRecords\":1,\"nullCount\":{\"col_boolean\":0,\"col_long\":0,\"col_float\":0,\"col_double\":0,\"col_decimal\":0,\"col_string\":0,\"col_binary\":0,\"col_date\":0,\"col_timestamp\":0,\"col_timestampntz\":0,\"col_array\":0,\"col_map\":0,\"col_struct\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1718597263000000","MIN_INSERTION_TIME":"1718597263000000","MAX_INSERTION_TIME":"1718597263000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/part-00000-376865ac-f6c8-4f0f-9d9b-bb8a982e6c0b-c000.snappy.parquet b/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/part-00000-376865ac-f6c8-4f0f-9d9b-bb8a982e6c0b-c000.snappy.parquet
new file mode 100644
index 000000000000..c0713ee5ce1f
Binary files /dev/null and b/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/part-00000-376865ac-f6c8-4f0f-9d9b-bb8a982e6c0b-c000.snappy.parquet differ
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/SparkMetastoreUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/SparkMetastoreUtil.java
index a8f28eb3cf45..05532e0b4afc 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/SparkMetastoreUtil.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/SparkMetastoreUtil.java
@@ -137,7 +137,7 @@ static HiveColumnStatistics fromMetastoreColumnStatistics(String columnName, Hiv
toDecimal(parameters.get(field + COLUMN_MAX)),
nullsCount,
distinctValuesWithNullCount);
- case TIMESTAMPLOCALTZ, INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME, VOID, UNKNOWN -> HiveColumnStatistics.empty();
+ case TIMESTAMPLOCALTZ, INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME, VARIANT, VOID, UNKNOWN -> HiveColumnStatistics.empty();
};
}
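
For statistics, variant joins the categories for which no column-level values are derived: min/max-style statistics have no defined meaning for a semi-structured value, so the mapping short-circuits to `HiveColumnStatistics.empty()` here, and the ThriftMetastoreUtil hunk below takes the same stance for statistics export. A sketch of that grouping, written out as a set (not code from the patch):

```java
import java.util.EnumSet;
import java.util.Set;

import io.trino.metastore.type.PrimitiveCategory;

final class StatsSupportSketch
{
    // Sketch only: the categories for which the metastore mappings in this
    // patch decline to derive or export column statistics. VARIANT sits next
    // to the interval types, which are likewise stats-free today.
    static final Set<PrimitiveCategory> NO_DERIVED_STATS = EnumSet.of(
            PrimitiveCategory.TIMESTAMPLOCALTZ,
            PrimitiveCategory.INTERVAL_YEAR_MONTH,
            PrimitiveCategory.INTERVAL_DAY_TIME,
            PrimitiveCategory.VARIANT,
            PrimitiveCategory.VOID,
            PrimitiveCategory.UNKNOWN);
}
```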
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/thrift/ThriftMetastoreUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/thrift/ThriftMetastoreUtil.java
index 1e377956c03d..6ca070dcdca9 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/thrift/ThriftMetastoreUtil.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/thrift/ThriftMetastoreUtil.java
@@ -789,6 +789,7 @@ public static ColumnStatisticsObj createMetastoreColumnStatistics(String columnN
case TIMESTAMPLOCALTZ:
case INTERVAL_YEAR_MONTH:
case INTERVAL_DAY_TIME:
+ case VARIANT:
// TODO support these, when we add support for these Hive types
case VOID:
case UNKNOWN:
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV1.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV1.java
index a1ee3ced114a..8cf3ddbce48b 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV1.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV1.java
@@ -186,6 +186,7 @@ private static int hash(TypeInfo type, Object value)
case TIMESTAMPLOCALTZ:
case INTERVAL_YEAR_MONTH:
case INTERVAL_DAY_TIME:
+ case VARIANT:
// TODO
break;
case VOID:
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV2.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV2.java
index 6989489b2707..c8ae02fc6461 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV2.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV2.java
@@ -188,6 +188,7 @@ private static int hash(TypeInfo type, Object value)
case TIMESTAMPLOCALTZ:
case INTERVAL_YEAR_MONTH:
case INTERVAL_DAY_TIME:
+ case VARIANT:
// TODO
break;
case VOID:
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveTypeUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveTypeUtil.java
index 1dd0c9b7aeea..c7ca54693a58 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveTypeUtil.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveTypeUtil.java
@@ -107,6 +107,7 @@ private static boolean typeSupported(PrimitiveCategory category)
case INTERVAL_YEAR_MONTH,
INTERVAL_DAY_TIME,
VOID,
+ VARIANT,
UNKNOWN -> false;
};
}
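
The same mechanical pattern runs through all of the Hive-side files in this patch: switches over `PrimitiveCategory` must account for the new constant, so statistics, bucketing, type support, and the write path below each state explicitly that variant is unsupported for now. The default-less switch expressions in SparkMetastoreUtil and HiveTypeUtil enforce this at compile time; the older statement switches were updated by hand to match. A simplified sketch of the shape, not the actual connector code:

```java
// Simplified sketch: an exhaustive switch expression over PrimitiveCategory.
// With no default branch, adding VARIANT to the enum fails compilation until
// every such switch names it, so the new category cannot slip through a
// read/write path unhandled.
static boolean writableSketch(PrimitiveCategory category)
{
    return switch (category) {
        case BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING,
                DATE, TIMESTAMP, BINARY, DECIMAL, VARCHAR, CHAR -> true;
        case TIMESTAMPLOCALTZ, INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME,
                VARIANT, VOID, UNKNOWN -> false;
    };
}
```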
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveWriteUtils.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveWriteUtils.java
index fc6cf307e54a..3cd9b2619016 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveWriteUtils.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveWriteUtils.java
@@ -304,6 +304,7 @@ private static boolean isWritablePrimitiveType(PrimitiveCategory primitiveCatego
case TIMESTAMPLOCALTZ:
case INTERVAL_YEAR_MONTH:
case INTERVAL_DAY_TIME:
+ case VARIANT:
case UNKNOWN:
// unsupported for writing
break;