Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/src/main/sphinx/connector/hive.rst
Original file line number Diff line number Diff line change
Expand Up @@ -920,6 +920,16 @@ The following operations are not supported when ``avro_schema_url`` is set:
* Bucketing(``bucketed_by``) columns are not supported in ``CREATE TABLE``.
* ``ALTER TABLE`` commands modifying columns are not supported.

ALTER TABLE
"""""""""""

.. _hive-set-column-type:

The connector supports changing column types with
``ALTER TABLE [ IF EXISTS ] name ALTER COLUMN column_name SET DATA TYPE new_type``.

The column type change only modifies the table metadata in the Hive metastore; existing data files are not rewritten.

.. _hive-alter-table-execute:

ALTER TABLE EXECUTE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
import io.trino.spi.block.Block;
import io.trino.spi.block.ByteArrayBlock;
import io.trino.spi.block.IntArrayBlock;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Convert ORC block within ColumnAdaptation

The commit message bears no rationale. In fact, what's the benefit of the change?

import io.trino.spi.block.LongArrayBlock;
import io.trino.spi.block.RunLengthEncodedBlock;
import io.trino.spi.block.ShortArrayBlock;
import io.trino.spi.type.Type;

import javax.annotation.Nullable;
Expand All @@ -43,7 +45,9 @@
import static io.trino.orc.reader.ReaderUtils.minNonNullValueSize;
import static io.trino.orc.reader.ReaderUtils.verifyStreamType;
import static io.trino.orc.stream.MissingInputStreamSource.missingStreamSource;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.SmallintType.SMALLINT;
import static io.trino.spi.type.TinyintType.TINYINT;
import static java.util.Objects.requireNonNull;

Expand Down Expand Up @@ -77,8 +81,7 @@ public ByteColumnReader(Type type, OrcColumn column, LocalMemoryContext memoryCo
throws OrcCorruptionException
{
this.type = requireNonNull(type, "type is null");
// Iceberg maps ORC tinyint type to integer
verifyStreamType(column, type, t -> t == TINYINT || t == INTEGER);
verifyStreamType(column, type, t -> t == TINYINT || t == SMALLINT || t == INTEGER || t == BIGINT);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought the purpose of "Convert ORC block within ColumnAdaptation" was to simplify the readers, so I am surprised to see this change here.

I think I like the non-composed (without column adaptations) approach better though, so no changes requested here.


this.column = requireNonNull(column, "column is null");
this.memoryContext = requireNonNull(memoryContext, "memoryContext is null");
Expand Down Expand Up @@ -153,9 +156,15 @@ private Block readNonNullBlock()
if (type == TINYINT) {
return new ByteArrayBlock(nextBatchSize, Optional.empty(), values);
}
if (type == SMALLINT) {
return new ShortArrayBlock(nextBatchSize, Optional.empty(), convertToShortArray(values));
}
if (type == INTEGER) {
return new IntArrayBlock(nextBatchSize, Optional.empty(), convertToIntArray(values));
}
if (type == BIGINT) {
return new LongArrayBlock(nextBatchSize, Optional.empty(), convertToLongArray(values));
}
throw new VerifyError("Unsupported type " + type);
}

Expand All @@ -175,9 +184,15 @@ private Block readNullBlock(boolean[] isNull, int nonNullCount)
if (type == TINYINT) {
return new ByteArrayBlock(nextBatchSize, Optional.of(isNull), result);
}
if (type == SMALLINT) {
return new ShortArrayBlock(nextBatchSize, Optional.of(isNull), convertToShortArray(result));
}
if (type == INTEGER) {
return new IntArrayBlock(nextBatchSize, Optional.of(isNull), convertToIntArray(result));
}
if (type == BIGINT) {
return new LongArrayBlock(nextBatchSize, Optional.of(isNull), convertToLongArray(result));
}
throw new VerifyError("Unsupported type " + type);
}

Expand Down Expand Up @@ -220,6 +235,15 @@ public void startRowGroup(InputStreamSources dataStreamSources)
rowGroupOpen = false;
}

/**
 * Sign-extends each decoded byte to a 16-bit value so the batch can be
 * exposed as a {@code ShortArrayBlock} when the declared type is SMALLINT.
 *
 * @param bytes the decoded byte values for the current batch
 * @return a new array of the same length with every element widened to short
 */
private static short[] convertToShortArray(byte[] bytes)
{
    short[] result = new short[bytes.length];
    int index = 0;
    for (byte value : bytes) {
        result[index++] = value;
    }
    return result;
}

private static int[] convertToIntArray(byte[] bytes)
{
int[] values = new int[bytes.length];
Expand All @@ -229,6 +253,15 @@ private static int[] convertToIntArray(byte[] bytes)
return values;
}

private static long[] convertToLongArray(byte[] bytes)
Comment thread
raunaqmorarka marked this conversation as resolved.
Outdated
{
long[] values = new long[bytes.length];
for (int i = 0; i < bytes.length; i++) {
values[i] = bytes[i];
}
return values;
}

@Override
public String toString()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
import static io.trino.orc.metadata.OrcType.OrcTypeKind.BINARY;
import static io.trino.orc.metadata.OrcType.OrcTypeKind.LONG;
import static io.trino.orc.reader.ReaderUtils.invalidStreamType;
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.TimeType.TIME_MICROS;

public final class ColumnReaders
Expand Down Expand Up @@ -65,9 +64,6 @@ public static ColumnReader createColumnReader(
case BOOLEAN:
return new BooleanColumnReader(type, column, memoryContext.newLocalMemoryContext(ColumnReaders.class.getSimpleName()));
case BYTE:
if (type == INTEGER && !column.getAttributes().containsKey("iceberg.id")) {
throw invalidStreamType(column, type);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't require iceberg.id attribute in ORC reader

If I am understanding the change correctly, it could be titled "Fix reading ORC files after column evolved from tinyint to integer"

Am I reading this right?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for my misleading commit title. Updated it.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for changing the commit title.

Is TestHiveTransactionalTable.java the only test coverage we have for this "Fix reading ORC files after column evolved from tinyint to integer" change?

}
return new ByteColumnReader(type, column, memoryContext.newLocalMemoryContext(ColumnReaders.class.getSimpleName()));
case SHORT:
case INT:
Expand Down
2 changes: 2 additions & 0 deletions plugin/trino-hive/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,7 @@
<configuration>
<excludes>
<exclude>**/TestHiveGlueMetastore.java</exclude>
<exclude>**/TestHiveGlueMetastoreCompatibility.java</exclude>
<exclude>**/TestTrinoS3FileSystemAwsS3.java</exclude>
<exclude>**/TestFullParquetReader.java</exclude>
<exclude>**/Test*FailureRecoveryTest.java</exclude>
Expand Down Expand Up @@ -593,6 +594,7 @@
<configuration>
<includes>
<include>**/TestHiveGlueMetastore.java</include>
<include>**/TestHiveGlueMetastoreCompatibility.java</include>
<include>**/TestTrinoS3FileSystemAwsS3.java</include>
</includes>
</configuration>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.collect.Streams;
import io.airlift.json.JsonCodec;
import io.airlift.log.Logger;
import io.airlift.slice.Slice;
Expand Down Expand Up @@ -119,11 +121,14 @@
import io.trino.spi.statistics.TableStatistics;
import io.trino.spi.statistics.TableStatisticsMetadata;
import io.trino.spi.type.ArrayType;
import io.trino.spi.type.CharType;
import io.trino.spi.type.DecimalType;
import io.trino.spi.type.MapType;
import io.trino.spi.type.RowType;
import io.trino.spi.type.TimestampType;
import io.trino.spi.type.Type;
import io.trino.spi.type.TypeManager;
import io.trino.spi.type.VarcharType;
import org.apache.avro.Schema;
import org.apache.avro.SchemaParseException;
import org.apache.hadoop.fs.FileSystem;
Expand Down Expand Up @@ -167,6 +172,7 @@
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static com.google.common.collect.Iterables.concat;
import static com.google.common.collect.Iterables.getOnlyElement;
import static com.google.common.collect.MoreCollectors.toOptional;
import static io.trino.hdfs.ConfigurationUtils.toJobConf;
import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext;
import static io.trino.plugin.hive.HiveAnalyzeProperties.getColumnNames;
Expand Down Expand Up @@ -326,6 +332,11 @@
import static io.trino.spi.predicate.TupleDomain.withColumnDomains;
import static io.trino.spi.statistics.TableStatisticType.ROW_COUNT;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.DoubleType.DOUBLE;
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.RealType.REAL;
import static io.trino.spi.type.SmallintType.SMALLINT;
import static io.trino.spi.type.TinyintType.TINYINT;
import static io.trino.spi.type.TypeUtils.isFloatingPointNaN;
import static io.trino.spi.type.VarcharType.createUnboundedVarcharType;
import static java.lang.Boolean.parseBoolean;
Expand Down Expand Up @@ -1345,6 +1356,133 @@ public void dropColumn(ConnectorSession session, ConnectorTableHandle tableHandl
metastore.dropColumn(hiveTableHandle.getSchemaName(), hiveTableHandle.getTableName(), columnHandle.getName());
}

@Override
public void setColumnType(ConnectorSession session, ConnectorTableHandle tableHandle, ColumnHandle columnHandle, Type type)
{
HiveTableHandle table = (HiveTableHandle) tableHandle;
failIfAvroSchemaIsSet(table);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should fail for CSV files too (csv is all varchar)

HiveColumnHandle hiveColumn = (HiveColumnHandle) columnHandle;

HiveStorageFormat storageFormat = extractHiveStorageFormat(metastore.getTable(table.getSchemaName(), table.getTableName())
.orElseThrow(() -> new TableNotFoundException(table.getSchemaTableName())));
verifyStorageFormatForColumnTypeChange(storageFormat);
verifySupportedColumnTypeChange(hiveColumn.getType(), type);
if (table.getPartitionColumns().stream().anyMatch(partition -> partition.getName().equals(hiveColumn.getName()))) {
throw new TrinoException(NOT_SUPPORTED, "Changing partition column types is not supported");
}
ImmutableList.Builder<Partition> partitionsBuilder = ImmutableList.builder();
List<String> partitionsNames = metastore.getPartitionNames(table.getSchemaName(), table.getTableName())
.orElseThrow(() -> new TableNotFoundException(table.getSchemaTableName()));
for (List<String> partitionsNamesBatch : Lists.partition(partitionsNames, 1000)) {
metastore.getPartitionsByNames(table.getSchemaName(), table.getTableName(), partitionsNamesBatch).values().stream()
.filter(Optional::isPresent).map(Optional::get)
.forEach(partition -> {
boolean skipUpdatePartition = false;
verifyStorageFormatForColumnTypeChange(extractHiveStorageFormat(partition.getStorage().getStorageFormat()));
ImmutableList.Builder<Column> columns = ImmutableList.builder();
for (Column column : partition.getColumns()) {
if (column.getName().equals(hiveColumn.getName())) {
Type sourceType = column.getType().getType(typeManager, getTimestampPrecision(session));
verifySupportedColumnTypeChange(sourceType, type);
columns.add(column);
skipUpdatePartition = sourceType.equals(type);
}
else {
columns.add(column);
}
}
if (!skipUpdatePartition) {
partitionsBuilder.add(Partition.builder(partition).setColumns(columns.build()).build());
}
});
}
List<Partition> partitions = partitionsBuilder.build();
metastore.setColumnType(table.getSchemaName(), table.getTableName(), hiveColumn.getName(), toHiveType(type));
if (!partitions.isEmpty()) {
// Hive changes a column definition in each partitions unless the ALTER TABLE statement doesn't contain partition condition
// Trino doesn't support specifying partitions in ALTER TABLE, so SET DATA TYPE updates all partitions
// https://cwiki.apache.org/confluence/display/hive/languagemanual+ddl#LanguageManualDDL-AlterPartition
metastore.alterPartitions(table.getSchemaName(), table.getTableName(), partitions, OptionalLong.empty());
Comment on lines +1402 to +1405
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am surprised

Firstl, i wouldn't expect existing partitions to be updated at all.

Second, it will make it impossible to update type in large tables.

Let's talk more about this

}
}

private void verifyStorageFormatForColumnTypeChange(HiveStorageFormat storageFormat)
{
// TODO: Support other storage format except for CSV and REGEX
// AVRO leads to query failure after converting varchar to char(20)
// RCBINARY leads to query failure and incorrect results
// RCTEXT returns incorrect results on row types
// SEQUENCEFILE returns incorrect results on row types
// JSON leads to query failure for NaN after changing real to double type
// TEXTFILE returns incorrect results on row types
if (storageFormat != HiveStorageFormat.ORC && storageFormat != HiveStorageFormat.PARQUET) {
throw new TrinoException(NOT_SUPPORTED, "Unsupported storage format for changing column type: " + storageFormat);
Comment on lines 1418 to 1419
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the check effective?
can a partitioned table have partitions with different file format?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw it's looks like we should be able to lift the limitation for eg TEXTFILE easily.
please add a code comment explaining the logic, why ORC/PARQUET only and what is our stance about partitioned tables.

}
}

/**
 * Validates that the requested column type change is one of the supported
 * conversions; otherwise fails with NOT_SUPPORTED.
 */
private void verifySupportedColumnTypeChange(Type sourceType, Type targetType)
{
    if (canChangeColumnType(sourceType, targetType)) {
        return;
    }
    throw new TrinoException(NOT_SUPPORTED, "Cannot change type from %s to %s".formatted(sourceType, targetType));
}

/**
 * Decides whether a column can be converted from {@code source} to {@code target}
 * without rewriting data: identity, integer/floating-point widenings,
 * varchar/char conversions, precision-widening decimals, and element-wise
 * compatible arrays and rows.
 */
private static boolean canChangeColumnType(Type source, Type target)
{
    if (source.equals(target)) {
        return true;
    }
    // Widening integer conversions
    if (source == TINYINT) {
        return target == SMALLINT || target == INTEGER || target == BIGINT;
    }
    if (source == SMALLINT) {
        return target == INTEGER || target == BIGINT;
    }
    if (source == INTEGER) {
        return target == BIGINT;
    }
    // Widening floating-point conversion
    if (source == REAL) {
        return target == DOUBLE;
    }
    if (source instanceof VarcharType || source instanceof CharType) {
        // Truncating characters is supported
        return target instanceof VarcharType || target instanceof CharType;
    }
    if (source instanceof DecimalType sourceDecimal && target instanceof DecimalType targetDecimal) {
        // TODO: Support rescale in ORC DecimalColumnReader
        return sourceDecimal.getScale() == targetDecimal.getScale()
                && sourceDecimal.getPrecision() <= targetDecimal.getPrecision();
    }
    if (source instanceof ArrayType sourceArray && target instanceof ArrayType targetArray) {
        return canChangeColumnType(sourceArray.getElementType(), targetArray.getElementType());
    }
    if (source instanceof RowType sourceRow && target instanceof RowType targetRow) {
        // Every field present on either side must be convertible when it exists on both;
        // fields present on only one side are allowed, as the connector supports
        // dropping and re-adding a column
        return Streams.concat(sourceRow.getFields().stream(), targetRow.getFields().stream())
                .distinct()
                .allMatch(field -> {
                    String fieldName = field.getName().orElseThrow();
                    return findFieldByName(sourceRow.getFields(), fieldName)
                            .flatMap(sourceField -> findFieldByName(targetRow.getFields(), fieldName)
                                    .map(targetField -> canChangeColumnType(sourceField.getType(), targetField.getType())))
                            .orElse(true);
                });
    }
    return false;
}

/**
 * Looks up a row field by name. Uses {@code toOptional()} so that duplicate
 * field names fail fast instead of silently picking the first match.
 */
private static Optional<RowType.Field> findFieldByName(List<RowType.Field> fields, String fieldName)
{
    return fields.stream()
            .filter(candidate -> fieldName.equals(candidate.getName().orElseThrow()))
            .collect(toOptional());
}

@Override
public void setTableAuthorization(ConnectorSession session, SchemaTableName table, TrinoPrincipal principal)
{
Expand Down Expand Up @@ -3463,7 +3601,11 @@ public List<GrantInfo> listTablePrivileges(ConnectorSession session, SchemaTable

/**
 * Resolves the Hive storage format of a table from its storage descriptor
 * by delegating to the StorageFormat-based overload.
 */
private static HiveStorageFormat extractHiveStorageFormat(Table table)
{
    return extractHiveStorageFormat(table.getStorage().getStorageFormat());
}

private static HiveStorageFormat extractHiveStorageFormat(StorageFormat storageFormat)
{
String outputFormat = storageFormat.getOutputFormat();
String serde = storageFormat.getSerde();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,11 @@ public void dropColumn(String databaseName, String tableName, String columnName)
delegate.dropColumn(databaseName, tableName, columnName);
}

/**
 * Changes the declared type of a column by forwarding to the backing metastore.
 */
public void setColumnType(String databaseName, String tableName, String columnName, HiveType columnType)
{
    delegate.setColumnType(databaseName, tableName, columnName, columnType);
}

public Optional<Partition> getPartition(String databaseName, String tableName, List<String> partitionValues)
{
return delegate.getTable(databaseName, tableName)
Expand Down Expand Up @@ -373,7 +378,7 @@ public void updateTableWriteId(String dbName, String tableName, long transaction
delegate.updateTableWriteId(dbName, tableName, transactionId, writeId, rowCountChange);
}

/**
 * Replaces partition definitions by forwarding to the backing metastore.
 *
 * @param writeId optional write id forwarded to the delegate; empty when the
 *         alteration is not tied to a write — NOTE(review): appears to be an
 *         ACID transaction write id, confirm against the delegate's contract
 */
public void alterPartitions(String dbName, String tableName, List<Partition> partitions, OptionalLong writeId)
{
    delegate.alterPartitions(dbName, tableName, partitions, writeId);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,12 @@ public void dropColumn(String databaseName, String tableName, String columnName)
delegate.dropColumn(databaseName, tableName, columnName);
}

/**
 * Changes the declared type of a column by forwarding to the wrapped metastore.
 */
@Override
public void setColumnType(String databaseName, String tableName, String columnName, HiveType columnType)
{
    delegate.setColumnType(databaseName, tableName, columnName, columnType);
}

@Override
public Optional<Partition> getPartition(Table table, List<String> partitionValues)
{
Expand Down Expand Up @@ -427,7 +433,7 @@ public void alterPartitions(
String dbName,
String tableName,
List<Partition> partitions,
long writeId)
OptionalLong writeId)
{
delegate.alterPartitions(dbName, tableName, partitions, writeId);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ default void updatePartitionStatistics(Table table, String partitionName, Functi

void dropColumn(String databaseName, String tableName, String columnName);

void setColumnType(String databaseName, String tableName, String columnName, HiveType columnType);

Optional<Partition> getPartition(Table table, List<String> partitionValues);

/**
Expand Down Expand Up @@ -211,7 +213,7 @@ default void updateTableWriteId(String dbName, String tableName, long transactio
throw new UnsupportedOperationException();
}

/**
 * Replaces partition definitions; optional operation.
 *
 * @throws UnsupportedOperationException unless overridden by the implementation
 */
default void alterPartitions(String dbName, String tableName, List<Partition> partitions, OptionalLong writeId)
{
    throw new UnsupportedOperationException();
}
}
Expand Down
Loading