diff --git a/lib/trino-metastore/src/main/java/io/trino/metastore/HiveType.java b/lib/trino-metastore/src/main/java/io/trino/metastore/HiveType.java index 2f3bdb3a87bf..21247b772d94 100644 --- a/lib/trino-metastore/src/main/java/io/trino/metastore/HiveType.java +++ b/lib/trino-metastore/src/main/java/io/trino/metastore/HiveType.java @@ -34,6 +34,7 @@ import static io.trino.metastore.type.TypeConstants.TIMESTAMPLOCALTZ_TYPE_NAME; import static io.trino.metastore.type.TypeConstants.TIMESTAMP_TYPE_NAME; import static io.trino.metastore.type.TypeConstants.TINYINT_TYPE_NAME; +import static io.trino.metastore.type.TypeConstants.VARIANT_TYPE_NAME; import static io.trino.metastore.type.TypeInfoFactory.getPrimitiveTypeInfo; import static io.trino.metastore.type.TypeInfoUtils.getTypeInfoFromTypeString; import static io.trino.metastore.type.TypeInfoUtils.getTypeInfosFromTypeString; @@ -55,6 +56,7 @@ public final class HiveType public static final HiveType HIVE_TIMESTAMPLOCALTZ = new HiveType(getPrimitiveTypeInfo(TIMESTAMPLOCALTZ_TYPE_NAME)); public static final HiveType HIVE_DATE = new HiveType(getPrimitiveTypeInfo(DATE_TYPE_NAME)); public static final HiveType HIVE_BINARY = new HiveType(getPrimitiveTypeInfo(BINARY_TYPE_NAME)); + public static final HiveType HIVE_VARIANT = new HiveType(getPrimitiveTypeInfo(VARIANT_TYPE_NAME)); private final HiveTypeName hiveTypeName; private final TypeInfo typeInfo; diff --git a/lib/trino-metastore/src/main/java/io/trino/metastore/type/PrimitiveCategory.java b/lib/trino-metastore/src/main/java/io/trino/metastore/type/PrimitiveCategory.java index f0336cff7272..dac44e8c5592 100644 --- a/lib/trino-metastore/src/main/java/io/trino/metastore/type/PrimitiveCategory.java +++ b/lib/trino-metastore/src/main/java/io/trino/metastore/type/PrimitiveCategory.java @@ -18,5 +18,5 @@ public enum PrimitiveCategory { VOID, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, DATE, TIMESTAMP, TIMESTAMPLOCALTZ, BINARY, DECIMAL, VARCHAR, CHAR, - INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME, UNKNOWN + INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME, VARIANT, UNKNOWN } diff --git a/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeConstants.java b/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeConstants.java index df41b965ce14..147e9f1f54a4 100644 --- a/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeConstants.java +++ b/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeConstants.java @@ -35,6 +35,7 @@ private TypeConstants() {} public static final String BINARY_TYPE_NAME = "binary"; public static final String INTERVAL_YEAR_MONTH_TYPE_NAME = "interval_year_month"; public static final String INTERVAL_DAY_TIME_TYPE_NAME = "interval_day_time"; + public static final String VARIANT_TYPE_NAME = "variant"; public static final String LIST_TYPE_NAME = "array"; public static final String MAP_TYPE_NAME = "map"; diff --git a/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeInfoUtils.java b/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeInfoUtils.java index dc1a94fc910a..bb351d433650 100644 --- a/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeInfoUtils.java +++ b/lib/trino-metastore/src/main/java/io/trino/metastore/type/TypeInfoUtils.java @@ -61,6 +61,7 @@ private TypeInfoUtils() {} registerType(new PrimitiveTypeEntry(PrimitiveCategory.INTERVAL_YEAR_MONTH, TypeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME)); registerType(new PrimitiveTypeEntry(PrimitiveCategory.INTERVAL_DAY_TIME, TypeConstants.INTERVAL_DAY_TIME_TYPE_NAME)); 
registerType(new PrimitiveTypeEntry(PrimitiveCategory.DECIMAL, TypeConstants.DECIMAL_TYPE_NAME)); + registerType(new PrimitiveTypeEntry(PrimitiveCategory.VARIANT, TypeConstants.VARIANT_TYPE_NAME)); registerType(new PrimitiveTypeEntry(PrimitiveCategory.UNKNOWN, "unknown")); } diff --git a/lib/trino-parquet/pom.xml b/lib/trino-parquet/pom.xml index 9cecbda33c17..f53194af3944 100644 --- a/lib/trino-parquet/pom.xml +++ b/lib/trino-parquet/pom.xml @@ -13,6 +13,11 @@ Trino - Parquet file format support + + com.fasterxml.jackson.core + jackson-core + + com.google.errorprone error_prone_annotations diff --git a/lib/trino-parquet/src/main/java/io/trino/parquet/ParquetTypeUtils.java b/lib/trino-parquet/src/main/java/io/trino/parquet/ParquetTypeUtils.java index 34980fd6af82..85ce8a555875 100644 --- a/lib/trino-parquet/src/main/java/io/trino/parquet/ParquetTypeUtils.java +++ b/lib/trino-parquet/src/main/java/io/trino/parquet/ParquetTypeUtils.java @@ -39,8 +39,11 @@ import java.util.Map; import java.util.Optional; +import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.ImmutableMap.toImmutableMap; import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.trino.spi.type.StandardTypes.JSON; +import static io.trino.spi.type.VarbinaryType.VARBINARY; import static java.lang.String.format; import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; import static org.apache.parquet.schema.Type.Repetition.REPEATED; @@ -291,6 +294,15 @@ private static Optional constructField(Type type, ColumnIO columnIO, bool boolean required = columnIO.getType().getRepetition() != OPTIONAL; int repetitionLevel = columnIO.getRepetitionLevel(); int definitionLevel = columnIO.getDefinitionLevel(); + if (isVariantType(type, columnIO)) { + checkArgument(type.getTypeParameters().isEmpty(), "Expected type parameters to be empty for variant but got %s", type.getTypeParameters()); + if (!(columnIO instanceof GroupColumnIO groupColumnIo)) { + throw new IllegalStateException("Expected columnIO to be GroupColumnIO but got %s".formatted(columnIO.getClass().getSimpleName())); + } + Field valueField = constructField(VARBINARY, groupColumnIo.getChild(0), false).orElseThrow(); + Field metadataField = constructField(VARBINARY, groupColumnIo.getChild(1), false).orElseThrow(); + return Optional.of(new VariantField(type, repetitionLevel, definitionLevel, required, valueField, metadataField)); + } if (type instanceof RowType rowType) { GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO; ImmutableList.Builder> fieldsBuilder = ImmutableList.builder(); @@ -350,4 +362,13 @@ private static Optional constructField(Type type, ColumnIO columnIO, bool } return Optional.of(new PrimitiveField(type, required, primitiveColumnIO.getColumnDescriptor(), primitiveColumnIO.getId())); } + + private static boolean isVariantType(Type type, ColumnIO columnIO) + { + return type.getTypeSignature().getBase().equals(JSON) && + columnIO instanceof GroupColumnIO groupColumnIo && + groupColumnIo.getChildrenCount() == 2 && + groupColumnIo.getChild("value") != null && + groupColumnIo.getChild("metadata") != null; + } } diff --git a/lib/trino-parquet/src/main/java/io/trino/parquet/VariantField.java b/lib/trino-parquet/src/main/java/io/trino/parquet/VariantField.java new file mode 100644 index 000000000000..f9b65baf4ca9 --- /dev/null +++ b/lib/trino-parquet/src/main/java/io/trino/parquet/VariantField.java @@ -0,0 +1,56 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you 
may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import io.trino.spi.type.Type; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +public class VariantField + extends Field +{ + private final Field value; + private final Field metadata; + + public VariantField(Type type, int repetitionLevel, int definitionLevel, boolean required, Field value, Field metadata) + { + super(type, repetitionLevel, definitionLevel, required); + this.value = requireNonNull(value, "value is null"); + this.metadata = requireNonNull(metadata, "metadata is null"); + } + + public Field getValue() + { + return value; + } + + public Field getMetadata() + { + return metadata; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("type", getType()) + .add("repetitionLevel", getRepetitionLevel()) + .add("definitionLevel", getDefinitionLevel()) + .add("required", isRequired()) + .add("value", value) + .add("metadata", getMetadata()) + .toString(); + } +} diff --git a/lib/trino-parquet/src/main/java/io/trino/parquet/reader/ParquetReader.java b/lib/trino-parquet/src/main/java/io/trino/parquet/reader/ParquetReader.java index c77d013fdc35..5ba0976581bf 100644 --- a/lib/trino-parquet/src/main/java/io/trino/parquet/reader/ParquetReader.java +++ b/lib/trino-parquet/src/main/java/io/trino/parquet/reader/ParquetReader.java @@ -19,6 +19,7 @@ import com.google.common.collect.ListMultimap; import com.google.errorprone.annotations.FormatMethod; import io.airlift.log.Logger; +import io.airlift.slice.Slice; import io.trino.memory.context.AggregatedMemoryContext; import io.trino.parquet.ChunkKey; import io.trino.parquet.Column; @@ -30,14 +31,17 @@ import io.trino.parquet.ParquetReaderOptions; import io.trino.parquet.ParquetWriteValidation; import io.trino.parquet.PrimitiveField; +import io.trino.parquet.VariantField; import io.trino.parquet.metadata.ColumnChunkMetadata; import io.trino.parquet.metadata.PrunedBlockMetadata; import io.trino.parquet.predicate.TupleDomainParquetPredicate; import io.trino.parquet.reader.FilteredOffsetIndex.OffsetRange; +import io.trino.parquet.spark.Variant; import io.trino.plugin.base.metrics.LongCount; import io.trino.spi.Page; import io.trino.spi.block.ArrayBlock; import io.trino.spi.block.Block; +import io.trino.spi.block.BlockBuilder; import io.trino.spi.block.DictionaryBlock; import io.trino.spi.block.RowBlock; import io.trino.spi.block.RunLengthEncodedBlock; @@ -59,6 +63,7 @@ import java.io.Closeable; import java.io.IOException; +import java.time.ZoneId; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -69,6 +74,7 @@ import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.airlift.slice.Slices.utf8Slice; import static io.trino.parquet.ParquetValidationUtils.validateParquet; import static io.trino.parquet.ParquetWriteValidation.StatisticsValidation; 
import static io.trino.parquet.ParquetWriteValidation.StatisticsValidation.createStatisticsValidationBuilder; @@ -76,6 +82,8 @@ import static io.trino.parquet.ParquetWriteValidation.WriteChecksumBuilder.createWriteChecksumBuilder; import static io.trino.parquet.reader.ListColumnReader.calculateCollectionOffsets; import static io.trino.parquet.reader.PageReader.createPageReader; +import static io.trino.spi.type.VarbinaryType.VARBINARY; +import static io.trino.spi.type.VarcharType.VARCHAR; import static java.lang.Math.max; import static java.lang.Math.min; import static java.lang.Math.toIntExact; @@ -97,6 +105,7 @@ public class ParquetReader private final List columnFields; private final List primitiveFields; private final ParquetDataSource dataSource; + private final ZoneId zoneId; private final ColumnReaderFactory columnReaderFactory; private final AggregatedMemoryContext memoryContext; @@ -149,6 +158,7 @@ public ParquetReader( this.primitiveFields = getPrimitiveFields(columnFields.stream().map(Column::field).collect(toImmutableList())); this.rowGroups = requireNonNull(rowGroups, "rowGroups is null"); this.dataSource = requireNonNull(dataSource, "dataSource is null"); + this.zoneId = requireNonNull(timeZone, "timeZone is null").toTimeZone().toZoneId(); this.columnReaderFactory = new ColumnReaderFactory(timeZone, options); this.memoryContext = requireNonNull(memoryContext, "memoryContext is null"); this.currentRowGroupMemoryContext = memoryContext.newAggregatedMemoryContext(); @@ -332,6 +342,25 @@ private void freeCurrentRowGroupBuffers() } } + private ColumnChunk readVariant(VariantField field) + throws IOException + { + ColumnChunk valueChunk = readColumnChunk(field.getValue()); + + BlockBuilder variantBlock = VARCHAR.createBlockBuilder(null, 1); + if (valueChunk.getBlock().getPositionCount() == 0) { + variantBlock.appendNull(); + } + else { + ColumnChunk metadataChunk = readColumnChunk(field.getMetadata()); + Slice value = VARBINARY.getSlice(valueChunk.getBlock(), 0); + Slice metadata = VARBINARY.getSlice(metadataChunk.getBlock(), 0); + Variant variant = new Variant(value.byteArray(), metadata.byteArray()); + VARCHAR.writeSlice(variantBlock, utf8Slice(variant.toJson(zoneId))); + } + return new ColumnChunk(variantBlock.build(), valueChunk.getDefinitionLevels(), valueChunk.getRepetitionLevels()); + } + private ColumnChunk readArray(GroupField field) throws IOException { @@ -523,6 +552,10 @@ else if (field instanceof GroupField groupField) { .flatMap(Optional::stream) .forEach(child -> parseField(child, primitiveFields)); } + else if (field instanceof VariantField variantField) { + parseField(variantField.getValue(), primitiveFields); + parseField(variantField.getMetadata(), primitiveFields); + } } public Block readBlock(Field field) @@ -535,7 +568,10 @@ private ColumnChunk readColumnChunk(Field field) throws IOException { ColumnChunk columnChunk; - if (field.getType() instanceof RowType) { + if (field instanceof VariantField variantField) { + columnChunk = readVariant(variantField); + } + else if (field.getType() instanceof RowType) { columnChunk = readStruct((GroupField) field); } else if (field.getType() instanceof MapType) { diff --git a/lib/trino-parquet/src/main/java/io/trino/parquet/spark/Variant.java b/lib/trino-parquet/src/main/java/io/trino/parquet/spark/Variant.java new file mode 100644 index 000000000000..12b7d6a69817 --- /dev/null +++ b/lib/trino-parquet/src/main/java/io/trino/parquet/spark/Variant.java @@ -0,0 +1,172 @@ +/* + * Licensed under the Apache License, Version 2.0 
(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.spark; + +import com.fasterxml.jackson.core.JsonGenerator; + +import java.io.CharArrayWriter; +import java.io.IOException; +import java.time.Instant; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.util.Base64; +import java.util.Locale; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.spark.VariantUtil.SIZE_LIMIT; +import static io.trino.parquet.spark.VariantUtil.VERSION; +import static io.trino.parquet.spark.VariantUtil.VERSION_MASK; +import static io.trino.parquet.spark.VariantUtil.getBinary; +import static io.trino.parquet.spark.VariantUtil.getBoolean; +import static io.trino.parquet.spark.VariantUtil.getDecimal; +import static io.trino.parquet.spark.VariantUtil.getDouble; +import static io.trino.parquet.spark.VariantUtil.getFloat; +import static io.trino.parquet.spark.VariantUtil.getLong; +import static io.trino.parquet.spark.VariantUtil.getMetadataKey; +import static io.trino.parquet.spark.VariantUtil.getString; +import static io.trino.parquet.spark.VariantUtil.getType; +import static io.trino.parquet.spark.VariantUtil.handleArray; +import static io.trino.parquet.spark.VariantUtil.handleObject; +import static io.trino.parquet.spark.VariantUtil.readUnsigned; +import static io.trino.plugin.base.util.JsonUtils.jsonFactory; +import static java.time.format.DateTimeFormatter.ISO_LOCAL_DATE; +import static java.time.format.DateTimeFormatter.ISO_LOCAL_TIME; +import static java.time.temporal.ChronoUnit.MICROS; + +/** + * Copied from https://github.com/apache/spark/blob/53d65fd12dd9231139188227ef9040d40d759021/common/variant/src/main/java/org/apache/spark/types/variant/Variant.java + * and adjusted the code style. + */ +public final class Variant +{ + private static final DateTimeFormatter TIMESTAMP_NTZ_FORMATTER = new DateTimeFormatterBuilder() + .append(ISO_LOCAL_DATE) + .appendLiteral(' ') + .append(ISO_LOCAL_TIME) + .toFormatter(Locale.US); + + private static final DateTimeFormatter TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder() + .append(TIMESTAMP_NTZ_FORMATTER) + .appendOffset("+HH:MM", "+00:00") + .toFormatter(Locale.US); + + private final byte[] value; + private final byte[] metadata; + // The variant value doesn't use the whole `value` binary, but starts from its `pos` index and + // spans a size of `valueSize(value, pos)`. This design avoids frequent copies of the value binary + // when reading a sub-variant in the array/object element. 
+ private final int position; + + public Variant(byte[] value, byte[] metadata) + { + this(value, metadata, 0); + } + + private Variant(byte[] value, byte[] metadata, int position) + { + this.value = value; + this.metadata = metadata; + this.position = position; + checkArgument(metadata.length >= 1, "metadata must be present"); + checkArgument((metadata[0] & VERSION_MASK) == VERSION, "metadata version must be %s", VERSION); + // Don't attempt to use a Variant larger than 16 MiB. We'll never produce one, and it risks memory instability. + checkArgument(metadata.length <= SIZE_LIMIT, "max metadata size is %s: %s", SIZE_LIMIT, metadata.length); + checkArgument(value.length <= SIZE_LIMIT, "max value size is %s: %s", SIZE_LIMIT, value.length); + } + + // Stringify the variant in JSON format. + public String toJson(ZoneId zoneId) + { + StringBuilder json = new StringBuilder(); + toJsonImpl(value, metadata, position, json, zoneId); + return json.toString(); + } + + private static void toJsonImpl(byte[] value, byte[] metadata, int position, StringBuilder json, ZoneId zoneId) + { + switch (getType(value, position)) { + case NULL -> json.append("null"); + case BOOLEAN -> json.append(getBoolean(value, position)); + case LONG -> json.append(getLong(value, position)); + case FLOAT -> json.append(getFloat(value, position)); + case DOUBLE -> json.append(getDouble(value, position)); + case DECIMAL -> json.append(getDecimal(value, position).toPlainString()); + case STRING -> json.append(escapeJson(getString(value, position))); + case BINARY -> appendQuoted(json, Base64.getEncoder().encodeToString(getBinary(value, position))); + case DATE -> appendQuoted(json, LocalDate.ofEpochDay(getLong(value, position)).toString()); + case TIMESTAMP -> appendQuoted(json, TIMESTAMP_FORMATTER.format(microsToInstant(getLong(value, position)).atZone(zoneId))); + case TIMESTAMP_NTZ -> appendQuoted(json, TIMESTAMP_NTZ_FORMATTER.format(microsToInstant(getLong(value, position)).atZone(ZoneOffset.UTC))); + case ARRAY -> handleArray(value, position, (size, offsetSize, offsetStart, dataStart) -> { + json.append('['); + for (int i = 0; i < size; ++i) { + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + if (i != 0) { + json.append(','); + } + toJsonImpl(value, metadata, elementPos, json, zoneId); + } + json.append(']'); + return null; + }); + case OBJECT -> handleObject(value, position, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + json.append('{'); + for (int i = 0; i < size; ++i) { + int id = readUnsigned(value, idStart + idSize * i, idSize); + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPosition = dataStart + offset; + if (i != 0) { + json.append(','); + } + json.append(escapeJson(getMetadataKey(metadata, id))); + json.append(':'); + toJsonImpl(value, metadata, elementPosition, json, zoneId); + } + json.append('}'); + return null; + }); + } + } + + private static Instant microsToInstant(long timestamp) + { + return Instant.EPOCH.plus(timestamp, MICROS); + } + + // A simplified and more performant version of `sb.append(escapeJson(value))`. It is used when we + // know `value` doesn't contain any special character that needs escaping. + private static void appendQuoted(StringBuilder json, String value) + { + json.append('"').append(value).append('"'); + } + + // Escape a string so that it can be pasted into JSON structure. 
+ // For example, if `str` only contains a new-line character, then the result content is "\n" + // (4 characters). + private static String escapeJson(String value) + { + try (CharArrayWriter writer = new CharArrayWriter(); + JsonGenerator generator = jsonFactory().createGenerator(writer)) { + generator.writeString(value); + generator.flush(); + return writer.toString(); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/lib/trino-parquet/src/main/java/io/trino/parquet/spark/VariantUtil.java b/lib/trino-parquet/src/main/java/io/trino/parquet/spark/VariantUtil.java new file mode 100644 index 000000000000..c17c33f647f0 --- /dev/null +++ b/lib/trino-parquet/src/main/java/io/trino/parquet/spark/VariantUtil.java @@ -0,0 +1,480 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.spark; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.Arrays; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Copied from https://github.com/apache/spark/blob/53d65fd12dd9231139188227ef9040d40d759021/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java + * + * This class defines constants related to the variant format and provides functions for + * manipulating variant binaries. + + * A variant is made up of 2 binaries: value and metadata. A variant value consists of a one-byte + * header and a number of content bytes (can be zero). The header byte is divided into upper 6 bits + * (called "type info") and lower 2 bits (called "basic type"). The content format is explained in + * the below constants for all possible basic type and type info values. + + * The variant metadata includes a version id and a dictionary of distinct strings (case-sensitive). + * Its binary format is: + * - Version: 1-byte unsigned integer. The only acceptable value is 1 currently. + * - Dictionary size: 4-byte little-endian unsigned integer. The number of keys in the + * dictionary. + * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. `offsets[i]` represents the + * starting position of string i, counting starting from the address of `offsets[0]`. Strings + * must be stored contiguously, so we don’t need to store the string size, instead, we compute it + * with `offset[i + 1] - offset[i]`. + * - UTF-8 string data. + */ +public final class VariantUtil +{ + public static final int BASIC_TYPE_BITS = 2; + public static final int BASIC_TYPE_MASK = 0x3; + public static final int TYPE_INFO_MASK = 0x3F; + + // Below is all possible basic type values. + // Primitive value. The type info value must be one of the values in the below section. + public static final int PRIMITIVE = 0; + // Short string value. The type info value is the string size, which must be in `[0, + // kMaxShortStrSize]`. + // The string content bytes directly follow the header byte. + public static final int SHORT_STR = 1; + // Object value. 
The content contains a size, a list of field ids, a list of field offsets, and + // the actual field data. The length of the id list is `size`, while the length of the offset + // list is `size + 1`, where the last offset represent the total size of the field data. The + // fields in an object must be sorted by the field name in alphabetical order. Duplicate field + // names in one object are not allowed. + // We use 5 bits in the type info to specify the integer type of the object header: it should + // be 0_b4_b3b2_b1b0 (MSB is 0), where: + // - b4 specifies the type of size. When it is 0/1, `size` is a little-endian 1/4-byte + // unsigned integer. + // - b3b2/b1b0 specifies the integer type of id and offset. When the 2 bits are 0/1/2, the + // list contains 1/2/3-byte little-endian unsigned integers. + public static final int OBJECT = 2; + // Array value. The content contains a size, a list of field offsets, and the actual element + // data. It is similar to an object without the id list. The length of the offset list + // is `size + 1`, where the last offset represent the total size of the element data. + // Its type info should be: 000_b2_b1b0: + // - b2 specifies the type of size. + // - b1b0 specifies the integer type of offset. + public static final int ARRAY = 3; + + // Below is all possible type info values for `PRIMITIVE`. + // JSON Null value. Empty content. + public static final int NULL = 0; + // True value. Empty content. + public static final int TRUE = 1; + // False value. Empty content. + public static final int FALSE = 2; + // 1-byte little-endian signed integer. + public static final int INT1 = 3; + // 2-byte little-endian signed integer. + public static final int INT2 = 4; + // 4-byte little-endian signed integer. + public static final int INT4 = 5; + // 4-byte little-endian signed integer. + public static final int INT8 = 6; + // 8-byte IEEE double. + public static final int DOUBLE = 7; + // 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed integer. + public static final int DECIMAL4 = 8; + // 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed integer. + public static final int DECIMAL8 = 9; + // 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed integer. + public static final int DECIMAL16 = 10; + // Date value. Content is 4-byte little-endian signed integer that represents the number of days + // from the Unix epoch. + public static final int DATE = 11; + // Timestamp value. Content is 8-byte little-endian signed integer that represents the number of + // microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is displayed to users in + // their local time zones and may be displayed differently depending on the execution environment. + public static final int TIMESTAMP = 12; + // Timestamp_ntz value. It has the same content as `TIMESTAMP` but should always be interpreted + // as if the local time zone is UTC. + public static final int TIMESTAMP_NTZ = 13; + // 4-byte IEEE float. + public static final int FLOAT = 14; + // Binary value. The content is (4-byte little-endian unsigned integer representing the binary + // size) + (size bytes of binary content). + public static final int BINARY = 15; + // Long string value. The content is (4-byte little-endian unsigned integer representing the + // string size) + (size bytes of string content). + public static final int LONG_STR = 16; + + public static final byte VERSION = 1; + // The lower 4 bits of the first metadata byte contain the version. 
+ public static final byte VERSION_MASK = 0x0F; + + public static final int U24_MAX = 0xFFFFFF; + public static final int U32_SIZE = 4; + + // Both variant value and variant metadata need to be no longer than 16MiB. + public static final int SIZE_LIMIT = U24_MAX + 1; + + public static final int MAX_DECIMAL4_PRECISION = 9; + public static final int MAX_DECIMAL8_PRECISION = 18; + public static final int MAX_DECIMAL16_PRECISION = 38; + + private VariantUtil() {} + + // Check the validity of an array index `position`. Throw `MALFORMED_VARIANT` if it is out of bound, + // meaning that the variant is malformed. + static void checkIndex(int position, int length) + { + if (position < 0 || position >= length) { + throw new IllegalArgumentException("Index out of bound: %s (length: %s)".formatted(position, length)); + } + } + + // Read a little-endian signed long value from `bytes[position, position + numBytes)`. + static long readLong(byte[] bytes, int position, int numBytes) + { + checkIndex(position, bytes.length); + checkIndex(position + numBytes - 1, bytes.length); + long result = 0; + // All bytes except the most significant byte should be unsign-extended and shifted (so we need + // `& 0xFF`). The most significant byte should be sign-extended and is handled after the loop. + for (int i = 0; i < numBytes - 1; ++i) { + long unsignedByteValue = bytes[position + i] & 0xFF; + result |= unsignedByteValue << (8 * i); + } + long signedByteValue = bytes[position + numBytes - 1]; + result |= signedByteValue << (8 * (numBytes - 1)); + return result; + } + + // Read a little-endian unsigned int value from `bytes[position, position + numBytes)`. The value must fit + // into a non-negative int (`[0, Integer.MAX_VALUE]`). + static int readUnsigned(byte[] bytes, int position, int numBytes) + { + checkIndex(position, bytes.length); + checkIndex(position + numBytes - 1, bytes.length); + int result = 0; + // Similar to the `readLong` loop, but all bytes should be unsign-extended. + for (int i = 0; i < numBytes; ++i) { + int unsignedByteValue = bytes[position + i] & 0xFF; + result |= unsignedByteValue << (8 * i); + } + if (result < 0) { + throw new IllegalArgumentException("Value out of bound: %s".formatted(result)); + } + return result; + } + + // The value type of variant value. It is determined by the header byte but not a 1:1 mapping + // (for example, INT1/2/4/8 all maps to `Type.LONG`). + public enum Type + { + NULL, + BOOLEAN, + LONG, + FLOAT, + DOUBLE, + DECIMAL, + STRING, + BINARY, + DATE, + TIMESTAMP, + TIMESTAMP_NTZ, + ARRAY, + OBJECT, + } + + // Get the value type of variant value `value[position...]`. It is only legal to call `get*` if + // `getType` returns this type (for example, it is only legal to call `getLong` if `getType` + // returns `Type.Long`). + // Throw `MALFORMED_VARIANT` if the variant is malformed. 
+ public static Type getType(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + return switch (basicType) { + case SHORT_STR -> Type.STRING; + case OBJECT -> Type.OBJECT; + case ARRAY -> Type.ARRAY; + default -> switch (typeInfo) { + case NULL -> Type.NULL; + case TRUE, FALSE -> Type.BOOLEAN; + case INT1, INT2, INT4, INT8 -> Type.LONG; + case DOUBLE -> Type.DOUBLE; + case DECIMAL4, DECIMAL8, DECIMAL16 -> Type.DECIMAL; + case DATE -> Type.DATE; + case TIMESTAMP -> Type.TIMESTAMP; + case TIMESTAMP_NTZ -> Type.TIMESTAMP_NTZ; + case FLOAT -> Type.FLOAT; + case BINARY -> Type.BINARY; + case LONG_STR -> Type.STRING; + default -> throw new IllegalArgumentException("Unexpected type: " + typeInfo); + }; + }; + } + + private static IllegalStateException unexpectedType(Type type) + { + return new IllegalStateException("Expect type to be " + type); + } + + // Get a boolean value from variant value `value[position...]`. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public static boolean getBoolean(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || (typeInfo != TRUE && typeInfo != FALSE)) { + throw unexpectedType(Type.BOOLEAN); + } + return typeInfo == TRUE; + } + + // Get a long value from variant value `value[position...]`. + // It is only legal to call it if `getType` returns one of `Type.LONG/DATE/TIMESTAMP/ + // TIMESTAMP_NTZ`. If the type is `DATE`, the return value is guaranteed to fit into an int and + // represents the number of days from the Unix epoch. If the type is `TIMESTAMP/TIMESTAMP_NTZ`, + // the return value represents the number of microseconds from the Unix epoch. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public static long getLong(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + String exceptionMessage = "Expect type to be LONG/DATE/TIMESTAMP/TIMESTAMP_NTZ"; + if (basicType != PRIMITIVE) { + throw new IllegalStateException(exceptionMessage); + } + return switch (typeInfo) { + case INT1 -> readLong(value, position + 1, 1); + case INT2 -> readLong(value, position + 1, 2); + case INT4, DATE -> readLong(value, position + 1, 4); + case INT8, TIMESTAMP, TIMESTAMP_NTZ -> readLong(value, position + 1, 8); + default -> throw new IllegalStateException(exceptionMessage); + }; + } + + // Get a double value from variant value `value[position...]`. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public static double getDouble(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || typeInfo != DOUBLE) { + throw unexpectedType(Type.DOUBLE); + } + return Double.longBitsToDouble(readLong(value, position + 1, 8)); + } + + // Check whether the precision and scale of the decimal are within the limit. 
+ private static void checkDecimal(BigDecimal decimal, int maxPrecision) + { + if (decimal.precision() > maxPrecision || decimal.scale() > maxPrecision) { + throw new IllegalArgumentException("Decimal out of bound: " + decimal); + } + } + + // Get a decimal value from variant value `value[position...]`. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public static BigDecimal getDecimal(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE) { + throw unexpectedType(Type.DECIMAL); + } + // Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be + // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`. + int scale = value[position + 1] & 0xFF; + BigDecimal result; + switch (typeInfo) { + case DECIMAL4: + result = BigDecimal.valueOf(readLong(value, position + 2, 4), scale); + checkDecimal(result, MAX_DECIMAL4_PRECISION); + break; + case DECIMAL8: + result = BigDecimal.valueOf(readLong(value, position + 2, 8), scale); + checkDecimal(result, MAX_DECIMAL8_PRECISION); + break; + case DECIMAL16: + checkIndex(position + 17, value.length); + byte[] bytes = new byte[16]; + // Copy the bytes reversely because the `BigInteger` constructor expects a big-endian + // representation. + for (int i = 0; i < 16; ++i) { + bytes[i] = value[position + 17 - i]; + } + result = new BigDecimal(new BigInteger(bytes), scale); + checkDecimal(result, MAX_DECIMAL16_PRECISION); + break; + default: + throw unexpectedType(Type.DECIMAL); + } + return result.stripTrailingZeros(); + } + + // Get a float value from variant value `value[position...]`. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public static float getFloat(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || typeInfo != FLOAT) { + throw unexpectedType(Type.FLOAT); + } + return Float.intBitsToFloat((int) readLong(value, position + 1, 4)); + } + + // Get a binary value from variant value `value[position...]`. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public static byte[] getBinary(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || typeInfo != BINARY) { + throw unexpectedType(Type.BINARY); + } + int start = position + 1 + U32_SIZE; + int length = readUnsigned(value, position + 1, U32_SIZE); + checkIndex(start + length - 1, value.length); + return Arrays.copyOfRange(value, start, start + length); + } + + // Get a string value from variant value `value[position...]`. + // Throw `MALFORMED_VARIANT` if the variant is malformed. 
+ public static String getString(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType == SHORT_STR || (basicType == PRIMITIVE && typeInfo == LONG_STR)) { + int start; + int length; + if (basicType == SHORT_STR) { + start = position + 1; + length = typeInfo; + } + else { + start = position + 1 + U32_SIZE; + length = readUnsigned(value, position + 1, U32_SIZE); + } + checkIndex(start + length - 1, value.length); + return new String(value, start, length, UTF_8); + } + throw unexpectedType(Type.STRING); + } + + public interface ObjectHandler + { + /** + * @param size Number of object fields. + * @param idSize The integer size of the field id list. + * @param offsetSize The integer size of the offset list. + * @param idStart The starting index of the field id list in the variant value array. + * @param offsetStart The starting index of the offset list in the variant value array. + * @param dataStart The starting index of field data in the variant value array. + */ + T apply(int size, int idSize, int offsetSize, int idStart, int offsetStart, int dataStart); + } + + // A helper function to access a variant object. It provides `handler` with its required + // parameters and returns what it returns. + public static T handleObject(byte[] value, int position, ObjectHandler handler) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != OBJECT) { + throw unexpectedType(Type.OBJECT); + } + // Refer to the comment of the `OBJECT` constant for the details of the object header encoding. + // Suppose `typeInfo` has a bit representation of 0_b4_b3b2_b1b0, the following line extracts + // b4 to determine whether the object uses a 1/4-byte size. + boolean largeSize = ((typeInfo >> 4) & 0x1) != 0; + int sizeBytes = (largeSize ? U32_SIZE : 1); + int size = readUnsigned(value, position + 1, sizeBytes); + // Extracts b3b2 to determine the integer size of the field id list. + int idSize = ((typeInfo >> 2) & 0x3) + 1; + // Extracts b1b0 to determine the integer size of the offset list. + int offsetSize = (typeInfo & 0x3) + 1; + int idStart = position + 1 + sizeBytes; + int offsetStart = idStart + size * idSize; + int dataStart = offsetStart + (size + 1) * offsetSize; + return handler.apply(size, idSize, offsetSize, idStart, offsetStart, dataStart); + } + + public interface ArrayHandler + { + /** + * @param size Number of array elements. + * @param offsetSize The integer size of the offset list. + * @param offsetStart The starting index of the offset list in the variant value array. + * @param dataStart The starting index of element data in the variant value array. + */ + T apply(int size, int offsetSize, int offsetStart, int dataStart); + } + + // A helper function to access a variant array. + public static T handleArray(byte[] value, int position, ArrayHandler handler) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != ARRAY) { + throw unexpectedType(Type.ARRAY); + } + // Refer to the comment of the `ARRAY` constant for the details of the object header encoding. 
+ // Suppose `typeInfo` has a bit representation of 000_b2_b1b0, the following line extracts + // b2 to determine whether the object uses a 1/4-byte size. + boolean largeSize = ((typeInfo >> 2) & 0x1) != 0; + int sizeBytes = (largeSize ? U32_SIZE : 1); + int size = readUnsigned(value, position + 1, sizeBytes); + // Extracts b1b0 to determine the integer size of the offset list. + int offsetSize = (typeInfo & 0x3) + 1; + int offsetStart = position + 1 + sizeBytes; + int dataStart = offsetStart + (size + 1) * offsetSize; + return handler.apply(size, offsetSize, offsetStart, dataStart); + } + + // Get a key at `id` in the variant metadata. + // Throw `MALFORMED_VARIANT` if the variant is malformed. An out-of-bound `id` is also considered + // a malformed variant because it is read from the corresponding variant value. + public static String getMetadataKey(byte[] metadata, int id) + { + checkIndex(0, metadata.length); + // Extracts the highest 2 bits in the metadata header to determine the integer size of the + // offset list. + int offsetSize = ((metadata[0] >> 6) & 0x3) + 1; + int dictSize = readUnsigned(metadata, 1, offsetSize); + if (id >= dictSize) { + throw new IllegalArgumentException("Index out of bound: %s (size: %s)".formatted(id, dictSize)); + } + // There are a header byte, a `dictSize` with `offsetSize` bytes, and `(dictSize + 1)` offsets + // before the string data. + int stringStart = 1 + (dictSize + 2) * offsetSize; + int offset = readUnsigned(metadata, 1 + (id + 1) * offsetSize, offsetSize); + int nextOffset = readUnsigned(metadata, 1 + (id + 2) * offsetSize, offsetSize); + if (offset > nextOffset) { + throw new IllegalArgumentException("Invalid offset: %s > %s".formatted(offset, nextOffset)); + } + checkIndex(stringStart + nextOffset - 1, metadata.length); + return new String(metadata, stringStart + offset, nextOffset - offset, UTF_8); + } +} diff --git a/lib/trino-parquet/src/test/java/io/trino/parquet/spark/TestVariant.java b/lib/trino-parquet/src/test/java/io/trino/parquet/spark/TestVariant.java new file mode 100644 index 000000000000..fb963f39fef2 --- /dev/null +++ b/lib/trino-parquet/src/test/java/io/trino/parquet/spark/TestVariant.java @@ -0,0 +1,87 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.spark; + +import org.intellij.lang.annotations.Language; +import org.junit.jupiter.api.Test; + +import java.time.ZoneOffset; + +import static java.time.ZoneOffset.UTC; +import static org.assertj.core.api.Assertions.assertThat; + +final class TestVariant +{ + @Test + void testVariantToJson() + { + assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 12, 1}, new byte[] {1, 1, 0, 3, 99, 111, 108}, "{\"col\":1}"); + assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 12, 2}, new byte[] {1, 1, 0, 5, 97, 114, 114, 97, 121}, "{\"array\":2}"); + assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 12, 3}, new byte[] {1, 1, 0, 3, 109, 97, 112}, "{\"map\":3}"); + assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 12, 4}, new byte[] {1, 1, 0, 6, 115, 116, 114, 117, 99, 116}, "{\"struct\":4}"); + + assertVariantToJson(new byte[] {12, 1}, new byte[] {1, 0, 0}, "1"); + assertVariantToJson(new byte[] {12, 2}, new byte[] {1, 0, 0}, "2"); + assertVariantToJson(new byte[] {12, 3}, new byte[] {1, 0, 0}, "3"); + assertVariantToJson(new byte[] {12, 4}, new byte[] {1, 0, 0}, "4"); + } + + @Test + void testVariantToJsonAllTypes() + { + assertVariantToJson(new byte[] {4}, new byte[] {1, 0, 0}, "true"); + assertVariantToJson(new byte[] {12, 1}, new byte[] {1, 0, 0}, "1"); + assertVariantToJson(new byte[] {56, -51, -52, 76, 62}, new byte[] {1, 0, 0}, "0.2"); + assertVariantToJson(new byte[] {28, 51, 51, 51, 51, 51, 51, -45, 63}, new byte[] {1, 0, 0}, "0.3"); + assertVariantToJson(new byte[] {32, 1, 4, 0, 0, 0}, new byte[] {1, 0, 0}, "0.4"); + assertVariantToJson(new byte[] {37, 116, 101, 115, 116, 32, 100, 97, 116, 97}, new byte[] {1, 0, 0}, "\"test data\""); + assertVariantToJson(new byte[] {60, 3, 0, 0, 0, 101, 104, 63}, new byte[] {1, 0, 0}, "\"ZWg/\""); + assertVariantToJson(new byte[] {44, -27, 72, 0, 0}, new byte[] {1, 0, 0}, "\"2021-02-03\""); + assertVariantToJson(new byte[] {48, -88, -34, 22, -20, 19, -116, 3, 0}, new byte[] {1, 0, 0}, "\"2001-08-22 01:02:03.321+00:00\""); + assertVariantToJson(new byte[] {52, 64, -34, -104, 21, -22, -73, 5, 0}, new byte[] {1, 0, 0}, "\"2021-01-02 12:34:56.123456\""); + assertVariantToJson(new byte[] {3, 1, 0, 2, 12, 1}, new byte[] {1, 0, 0}, "[1]"); + assertVariantToJson(new byte[] {2, 2, 0, 1, 0, 2, 4, 12, 1, 12, 2}, new byte[] {1, 2, 0, 4, 8, 107, 101, 121, 49, 107, 101, 121, 50}, "{\"key1\":1,\"key2\":2}"); + assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 12, 1}, new byte[] {1, 1, 0, 1, 120}, "{\"x\":1}"); + } + + @Test + void testVariantToJsonTimestamp() + { + assertThat(new Variant(new byte[] {48, -88, -34, 22, -20, 19, -116, 3, 0}, new byte[] {1, 0, 0}).toJson(ZoneOffset.ofHours(-18))) + .isEqualTo("\"2001-08-21 07:02:03.321-18:00\""); + assertThat(new Variant(new byte[] {48, -88, -34, 22, -20, 19, -116, 3, 0}, new byte[] {1, 0, 0}).toJson(ZoneOffset.ofHours(-1))) + .isEqualTo("\"2001-08-22 00:02:03.321-01:00\""); + assertThat(new Variant(new byte[] {48, -88, -34, 22, -20, 19, -116, 3, 0}, new byte[] {1, 0, 0}).toJson(UTC)) + .isEqualTo("\"2001-08-22 01:02:03.321+00:00\""); + assertThat(new Variant(new byte[] {48, -88, -34, 22, -20, 19, -116, 3, 0}, new byte[] {1, 0, 0}).toJson(ZoneOffset.ofHours(1))) + .isEqualTo("\"2001-08-22 02:02:03.321+01:00\""); + assertThat(new Variant(new byte[] {48, -88, -34, 22, -20, 19, -116, 3, 0}, new byte[] {1, 0, 0}).toJson(ZoneOffset.ofHours(18))) + .isEqualTo("\"2001-08-22 19:02:03.321+18:00\""); + } + + @Test + void testVariantToJsonNullValue() + { + assertVariantToJson(new byte[] {2, 1, 0, 0, 1, 0}, new byte[] {1, 1, 0, 
3, 99, 111, 108}, "{\"col\":null}"); + assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 0}, new byte[] {1, 1, 0, 5, 97, 114, 114, 97, 121}, "{\"array\":null}"); + assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 0}, new byte[] {1, 1, 0, 3, 109, 97, 112}, "{\"map\":null}"); + assertVariantToJson(new byte[] {2, 1, 0, 0, 2, 0}, new byte[] {1, 1, 0, 6, 115, 116, 114, 117, 99, 116}, "{\"struct\":null}"); + } + + private static void assertVariantToJson(byte[] value, byte[] metadata, @Language("JSON") String expectedJson) + { + assertThat(new Variant(value, metadata).toJson(UTC)) + .isEqualTo(expectedJson); + } +} diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaHiveTypeTranslator.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaHiveTypeTranslator.java index 8b31d8a6bf1e..82512f5c9119 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaHiveTypeTranslator.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/DeltaHiveTypeTranslator.java @@ -29,6 +29,7 @@ import io.trino.spi.type.TypeSignatureParameter; import io.trino.spi.type.VarcharType; +import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Verify.verify; import static com.google.common.collect.ImmutableList.toImmutableList; import static io.trino.metastore.HiveType.HIVE_BINARY; @@ -42,6 +43,7 @@ import static io.trino.metastore.HiveType.HIVE_SHORT; import static io.trino.metastore.HiveType.HIVE_STRING; import static io.trino.metastore.HiveType.HIVE_TIMESTAMP; +import static io.trino.metastore.HiveType.HIVE_VARIANT; import static io.trino.metastore.type.CharTypeInfo.MAX_CHAR_LENGTH; import static io.trino.metastore.type.TypeInfoFactory.getCharTypeInfo; import static io.trino.metastore.type.TypeInfoFactory.getListTypeInfo; @@ -57,6 +59,7 @@ import static io.trino.spi.type.IntegerType.INTEGER; import static io.trino.spi.type.RealType.REAL; import static io.trino.spi.type.SmallintType.SMALLINT; +import static io.trino.spi.type.StandardTypes.JSON; import static io.trino.spi.type.TinyintType.TINYINT; import static io.trino.spi.type.VarbinaryType.VARBINARY; import static java.lang.String.format; @@ -129,6 +132,10 @@ public static TypeInfo translate(Type type) if (type instanceof DecimalType decimalType) { return new DecimalTypeInfo(decimalType.getPrecision(), decimalType.getScale()); } + if (type.getTypeSignature().getBase().equals(JSON)) { + checkArgument(type.getTypeSignature().getParameters().isEmpty(), "JSON type should not have parameters"); + return HIVE_VARIANT.getTypeInfo(); + } if (type instanceof ArrayType arrayType) { TypeInfo elementType = translate(arrayType.getElementType()); return getListTypeInfo(elementType); diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeSchemaSupport.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeSchemaSupport.java index d9fc72df070a..39b6e37c2e2a 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeSchemaSupport.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeSchemaSupport.java @@ -86,6 +86,7 @@ import static io.trino.spi.type.IntegerType.INTEGER; import static io.trino.spi.type.RealType.REAL; import static io.trino.spi.type.SmallintType.SMALLINT; +import static io.trino.spi.type.StandardTypes.JSON; import static io.trino.spi.type.TimestampType.TIMESTAMP_MICROS; 
import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS; import static io.trino.spi.type.TinyintType.TINYINT; @@ -765,7 +766,7 @@ private static Type buildType(TypeManager typeManager, JsonNode typeNode, boolea // For more info, see https://delta-users.slack.com/archives/GKTUWT03T/p1585760533005400 // and https://cwiki.apache.org/confluence/display/Hive/Different+TIMESTAMP+types case "timestamp" -> TIMESTAMP_TZ_MILLIS; - case "variant" -> throw new UnsupportedTypeException("variant"); + case "variant" -> typeManager.getType(new TypeSignature(JSON)); default -> throw new TypeNotFoundException(new TypeSignature(primitiveType)); }; } diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java index 0f0b29f84340..8c3da6fcc062 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeBasic.java @@ -133,6 +133,7 @@ public class TestDeltaLakeBasic new ResourceTable("unsupported_writer_feature", "deltalake/unsupported_writer_feature"), new ResourceTable("unsupported_writer_version", "deltalake/unsupported_writer_version"), new ResourceTable("variant", "databricks153/variant"), + new ResourceTable("variant_types", "databricks153/variant_types"), new ResourceTable("type_widening", "databricks153/type_widening"), new ResourceTable("type_widening_partition", "databricks153/type_widening_partition"), new ResourceTable("type_widening_nested", "databricks153/type_widening_nested")); @@ -1504,14 +1505,65 @@ public void testUniFormIcebergV2() @Test public void testVariant() { - // TODO (https://github.com/trinodb/trino/issues/22309) Add support for variant type assertThat(query("DESCRIBE variant")).result().projected("Column", "Type") .skippingTypesCheck() - .matches("VALUES ('col_int', 'integer'), ('col_string', 'varchar')"); + .matches("VALUES " + + "('col_int', 'integer')," + + "('simple_variant', 'json')," + + "('array_variant', 'array(json)')," + + "('map_variant', 'map(varchar, json)')," + + "('struct_variant', 'row(x json)')," + + "('col_string', 'varchar')"); + + assertThat(query("SELECT col_int, simple_variant, array_variant[1], map_variant['key1'], struct_variant.x, col_string FROM variant")) + .skippingTypesCheck() + .matches("VALUES " + + "(1, JSON '{\"col\":1}', JSON '{\"array\":2}', JSON '{\"map\":3}', JSON '{\"struct\":4}', 'test data')," + + "(2, JSON '{\"col\":null}', JSON '{\"array\":null}', JSON '{\"map\":null}', JSON '{\"struct\":null}', 'test null data')," + + "(3, NULL, NULL, NULL, NULL, 'test null')," + + "(4, JSON '1', JSON '2', JSON '3', JSON '4', 'test without fields')"); - assertQuery("SELECT * FROM variant", "VALUES (1, 'test data')"); + assertQueryFails("INSERT INTO variant VALUES (2, null, null, null, null, 'new data')", "Unsupported writer features: .*"); + } - assertQueryFails("INSERT INTO variant VALUES (2, 'new data')", "Unsupported writer features: .*"); + /** + * @see databricks153.variant_types + */ + @Test + public void testVariantTypes() + { + assertThat(query(""" + SELECT + col_boolean, + col_long, + col_float, + col_double, + col_decimal, + col_string, + col_binary, + col_date, + col_timestamp, + col_timestampntz, + col_array, + col_map, + col_struct + FROM variant_types""")) + .skippingTypesCheck() + .matches(""" + VALUES + ('true', + '1', + '0.2', + '0.3', + '0.4', + '"test data"', + 
'"ZWg/"', + '"2021-02-03"', + '"2001-08-21 19:02:03.321-06:00"', + '"2021-01-02 12:34:56.123456"', + '[1]', + '{"key1":1,"key2":2}', + '{"x":1}')"""); } @Test diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/README.md b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/README.md index 65935f0c8ac3..62015da9be4a 100644 --- a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/README.md +++ b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/README.md @@ -8,4 +8,13 @@ LOCATION ?; INSERT INTO default.test_variant VALUES (1, parse_json('{"col":1}'), array(parse_json('{"array":2}')), map('key1', parse_json('{"map":3}')), named_struct('x', parse_json('{"struct":4}')), 'test data'); + +INSERT INTO default.test_variant +VALUES (2, parse_json('{"col":null}'), array(parse_json('{"array":null}')), map('key1', parse_json('{"map":null}')), named_struct('x', parse_json('{"struct":null}')), 'test null data'); + +INSERT INTO default.test_variant +VALUES (3, parse_json(NULL), array(NULL), map('key1', NULL), named_struct('x', NULL), 'test null'); + +INSERT INTO default.test_variant +VALUES (4, parse_json('1'), array(parse_json('2')), map('key1', parse_json('3')), named_struct('x', parse_json('4')), 'test without fields'); ``` diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000002.json b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000002.json new file mode 100644 index 000000000000..2bd3f1f5801a --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1718586112107,"userId":"7853186923043731","userName":"yuya.ebihara@starburstdata.com","operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0607-024930-gxd23c26","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"3377"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/15.3.x-scala2.12","txnId":"58f6af45-8063-4fae-9018-95bfe6cbd561"}} +{"add":{"path":"part-00000-090dbdcd-fc4e-47c9-8549-59cf79d9c7e6-c000.snappy.parquet","partitionValues":{},"size":3377,"modificationTime":1718586111000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"col_int\":2,\"col_string\":\"test null data\"},\"maxValues\":{\"col_int\":2,\"col_string\":\"test null data\"},\"nullCount\":{\"col_int\":0,\"simple_variant\":0,\"array_variant\":0,\"map_variant\":0,\"struct_variant\":{\"x\":0},\"col_string\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1718586111000000","MIN_INSERTION_TIME":"1718586111000000","MAX_INSERTION_TIME":"1718586111000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000003.json b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000003.json new file mode 100644 index 000000000000..66839aa9fb0f --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000003.json @@ -0,0 +1,2 @@ 
+{"commitInfo":{"timestamp":1718586451540,"userId":"7853186923043731","userName":"yuya.ebihara@starburstdata.com","operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0607-024930-gxd23c26","readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"2849"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/15.3.x-scala2.12","txnId":"43f39922-3699-49fe-9b32-a1556bbef0a0"}} +{"add":{"path":"part-00000-4d01f20d-41d2-49fb-b98e-251a4c0984aa-c000.snappy.parquet","partitionValues":{},"size":2849,"modificationTime":1718586452000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"col_int\":3,\"col_string\":\"test null\"},\"maxValues\":{\"col_int\":3,\"col_string\":\"test null\"},\"nullCount\":{\"col_int\":0,\"simple_variant\":1,\"array_variant\":0,\"map_variant\":0,\"struct_variant\":{\"x\":1},\"col_string\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1718586452000000","MIN_INSERTION_TIME":"1718586452000000","MAX_INSERTION_TIME":"1718586452000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000004.json b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000004.json new file mode 100644 index 000000000000..7d8d016f9b08 --- /dev/null +++ b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/_delta_log/00000000000000000004.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1718587514563,"userId":"7853186923043731","userName":"yuya.ebihara@starburstdata.com","operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"notebook":{"notebookId":"1841155838656679"},"clusterId":"0607-024930-gxd23c26","readVersion":2,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"3156"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/15.3.x-scala2.12","txnId":"75d31328-c821-4273-af1b-4d7f5d15cdd8"}} +{"add":{"path":"part-00000-81478783-cb9b-4719-975e-027177ae86e3-c000.snappy.parquet","partitionValues":{},"size":3156,"modificationTime":1718587515000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"col_int\":4,\"col_string\":\"test without fields\"},\"maxValues\":{\"col_int\":4,\"col_string\":\"test without fields\"},\"nullCount\":{\"col_int\":0,\"simple_variant\":0,\"array_variant\":0,\"map_variant\":0,\"struct_variant\":{\"x\":0},\"col_string\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1718587515000000","MIN_INSERTION_TIME":"1718587515000000","MAX_INSERTION_TIME":"1718587515000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-090dbdcd-fc4e-47c9-8549-59cf79d9c7e6-c000.snappy.parquet b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-090dbdcd-fc4e-47c9-8549-59cf79d9c7e6-c000.snappy.parquet new file mode 100644 index 000000000000..1917a2760127 Binary files /dev/null and b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-090dbdcd-fc4e-47c9-8549-59cf79d9c7e6-c000.snappy.parquet differ diff --git 
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-4d01f20d-41d2-49fb-b98e-251a4c0984aa-c000.snappy.parquet b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-4d01f20d-41d2-49fb-b98e-251a4c0984aa-c000.snappy.parquet
new file mode 100644
index 000000000000..6c58889f533e
Binary files /dev/null and b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-4d01f20d-41d2-49fb-b98e-251a4c0984aa-c000.snappy.parquet differ
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-81478783-cb9b-4719-975e-027177ae86e3-c000.snappy.parquet b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-81478783-cb9b-4719-975e-027177ae86e3-c000.snappy.parquet
new file mode 100644
index 000000000000..cd4a81cc6c7a
Binary files /dev/null and b/plugin/trino-delta-lake/src/test/resources/databricks153/variant/part-00000-81478783-cb9b-4719-975e-027177ae86e3-c000.snappy.parquet differ
diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/README.md b/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/README.md
new file mode 100644
index 000000000000..c18728cf598a
--- /dev/null
+++ b/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/README.md
@@ -0,0 +1,21 @@
+Data generated using Databricks 15.3:
+
+```sql
+CREATE TABLE default.test_variant_types
+USING delta
+LOCATION ?
+AS SELECT
+    CAST(true AS variant) AS col_boolean,
+    CAST(1 AS variant) AS col_long,
+    CAST(CAST('0.2' AS float) AS variant) AS col_float,
+    CAST(CAST('0.3' AS double) AS variant) AS col_double,
+    CAST(CAST('0.4' AS decimal) AS variant) AS col_decimal,
+    CAST('test data' AS variant) AS col_string,
+    CAST(X'65683F' AS variant) AS col_binary,
+    CAST(date '2021-02-03' AS variant) AS col_date,
+    CAST(timestamp '2001-08-22 01:02:03.321 UTC' AS variant) AS col_timestamp,
+    CAST(timestamp_ntz '2021-01-02 12:34:56.123456' AS variant) AS col_timestampntz,
+    CAST(array(1) AS variant) AS col_array,
+    CAST(map('key1', 1, 'key2', 2) AS variant) AS col_map,
+    CAST(named_struct('x', 1) AS variant) AS col_struct;
+```
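A cross-check for the expected values in `testVariantTypes`: the `col_binary` result `"ZWg/"` is the base64 rendering of the inserted bytes `X'65683F'`, which is how binary variant values end up inside the JSON text. A minimal verification sketch:

```java
import java.util.Base64;

public class VariantBinaryRendering
{
    public static void main(String[] args)
    {
        byte[] bytes = {0x65, 0x68, 0x3F}; // X'65683F' from the CREATE TABLE above
        System.out.println(Base64.getEncoder().encodeToString(bytes)); // prints ZWg/
    }
}
```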
+{"metaData":{"id":"ea47d678-864d-4652-8957-3099f5cf3153","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"col_boolean\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_long\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_float\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_double\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_decimal\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_string\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_binary\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_date\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_timestamp\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_timestampntz\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_array\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_map\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col_struct\",\"type\":\"variant\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true"},"createdTime":1718597261906}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","variantType-preview"],"writerFeatures":["deletionVectors","variantType-preview"]}} +{"add":{"path":"part-00000-376865ac-f6c8-4f0f-9d9b-bb8a982e6c0b-c000.snappy.parquet","partitionValues":{},"size":6331,"modificationTime":1718597263000,"dataChange":true,"stats":"{\"numRecords\":1,\"nullCount\":{\"col_boolean\":0,\"col_long\":0,\"col_float\":0,\"col_double\":0,\"col_decimal\":0,\"col_string\":0,\"col_binary\":0,\"col_date\":0,\"col_timestamp\":0,\"col_timestampntz\":0,\"col_array\":0,\"col_map\":0,\"col_struct\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1718597263000000","MIN_INSERTION_TIME":"1718597263000000","MAX_INSERTION_TIME":"1718597263000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/part-00000-376865ac-f6c8-4f0f-9d9b-bb8a982e6c0b-c000.snappy.parquet b/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/part-00000-376865ac-f6c8-4f0f-9d9b-bb8a982e6c0b-c000.snappy.parquet new file mode 100644 index 000000000000..c0713ee5ce1f Binary files /dev/null and b/plugin/trino-delta-lake/src/test/resources/databricks153/variant_types/part-00000-376865ac-f6c8-4f0f-9d9b-bb8a982e6c0b-c000.snappy.parquet differ diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/SparkMetastoreUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/SparkMetastoreUtil.java index a8f28eb3cf45..05532e0b4afc 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/SparkMetastoreUtil.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/SparkMetastoreUtil.java @@ -137,7 +137,7 @@ static HiveColumnStatistics fromMetastoreColumnStatistics(String columnName, Hiv toDecimal(parameters.get(field + COLUMN_MAX)), nullsCount, distinctValuesWithNullCount); - case TIMESTAMPLOCALTZ, INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME, VOID, UNKNOWN -> HiveColumnStatistics.empty(); + case TIMESTAMPLOCALTZ, INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME, VARIANT, VOID, UNKNOWN -> HiveColumnStatistics.empty(); }; } diff --git 
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/thrift/ThriftMetastoreUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/thrift/ThriftMetastoreUtil.java
index 1e377956c03d..6ca070dcdca9 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/thrift/ThriftMetastoreUtil.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/metastore/thrift/ThriftMetastoreUtil.java
@@ -789,6 +789,7 @@ public static ColumnStatisticsObj createMetastoreColumnStatistics(String columnN
             case TIMESTAMPLOCALTZ:
             case INTERVAL_YEAR_MONTH:
             case INTERVAL_DAY_TIME:
+            case VARIANT:
                 // TODO support these, when we add support for these Hive types
             case VOID:
             case UNKNOWN:
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV1.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV1.java
index a1ee3ced114a..8cf3ddbce48b 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV1.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV1.java
@@ -186,6 +186,7 @@ private static int hash(TypeInfo type, Object value)
             case TIMESTAMPLOCALTZ:
             case INTERVAL_YEAR_MONTH:
             case INTERVAL_DAY_TIME:
+            case VARIANT: // TODO
                 break;
             case VOID:
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV2.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV2.java
index 6989489b2707..c8ae02fc6461 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV2.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveBucketingV2.java
@@ -188,6 +188,7 @@ private static int hash(TypeInfo type, Object value)
             case TIMESTAMPLOCALTZ:
             case INTERVAL_YEAR_MONTH:
             case INTERVAL_DAY_TIME:
+            case VARIANT: // TODO
                 break;
             case VOID:
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveTypeUtil.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveTypeUtil.java
index 1dd0c9b7aeea..c7ca54693a58 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveTypeUtil.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveTypeUtil.java
@@ -107,6 +107,7 @@ private static boolean typeSupported(PrimitiveCategory category)
             case INTERVAL_YEAR_MONTH,
                  INTERVAL_DAY_TIME,
                  VOID,
+                 VARIANT,
                  UNKNOWN -> false;
         };
     }
diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveWriteUtils.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveWriteUtils.java
index fc6cf307e54a..3cd9b2619016 100644
--- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveWriteUtils.java
+++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/HiveWriteUtils.java
@@ -304,6 +304,7 @@ private static boolean isWritablePrimitiveType(PrimitiveCategory primitiveCatego
             case TIMESTAMPLOCALTZ:
             case INTERVAL_YEAR_MONTH:
             case INTERVAL_DAY_TIME:
+            case VARIANT:
             case UNKNOWN:
                 // unsupported for writing
                 break;
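One pattern worth calling out across these trino-hive hunks: `PrimitiveCategory` is matched exhaustively in several places (the arrow switches in `SparkMetastoreUtil` and `HiveTypeUtil` have no `default` branch), so adding `VARIANT` to the enum forces every such switch to take a position on it — uniformly "recognized but unsupported" for now. A toy illustration of that mechanism (all names invented, not Trino code):

```java
public class CategorySupport
{
    // Stand-in for PrimitiveCategory; adding a constant here breaks compilation of
    // every exhaustive switch below until it handles the new case.
    enum Category { BOOLEAN, VARIANT }

    static boolean isWritable(Category category)
    {
        return switch (category) {
            case BOOLEAN -> true;
            case VARIANT -> false; // mirrors HiveWriteUtils: unsupported for writing
        };
    }

    public static void main(String[] args)
    {
        System.out.println(isWritable(Category.VARIANT)); // prints false
    }
}
```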