Skip to content

Commit 9c9e951

Browse files
groupcache4321yluan
authored andcommitted
Extract dereferenceSubFieldTypes() from ParquetPageSourceFactory
1 parent 5150eb5 commit 9c9e951

File tree

1 file changed

+45
-20
lines changed

1 file changed

+45
-20
lines changed

plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import io.trino.plugin.hive.AcidInfo;
3434
import io.trino.plugin.hive.FileFormatDataSourceStats;
3535
import io.trino.plugin.hive.HiveColumnHandle;
36+
import io.trino.plugin.hive.HiveColumnProjectionInfo;
3637
import io.trino.plugin.hive.HiveConfig;
3738
import io.trino.plugin.hive.HivePageSourceFactory;
3839
import io.trino.plugin.hive.HiveType;
@@ -331,30 +332,26 @@ public static Optional<MessageType> getParquetMessageType(List<HiveColumnHandle>
331332

332333
public static Optional<org.apache.parquet.schema.Type> getColumnType(HiveColumnHandle column, MessageType messageType, boolean useParquetColumnNames)
333334
{
334-
Optional<org.apache.parquet.schema.Type> columnType = getBaseColumnParquetType(column, messageType, useParquetColumnNames);
335-
if (columnType.isEmpty() || column.getHiveColumnProjectionInfo().isEmpty()) {
336-
return columnType;
335+
Optional<org.apache.parquet.schema.Type> baseColumnType = getBaseColumnParquetType(column, messageType, useParquetColumnNames);
336+
if (baseColumnType.isEmpty() || column.getHiveColumnProjectionInfo().isEmpty()) {
337+
return baseColumnType;
337338
}
338-
GroupType baseType = columnType.get().asGroupType();
339-
ImmutableList.Builder<org.apache.parquet.schema.Type> typeBuilder = ImmutableList.builder();
340-
org.apache.parquet.schema.Type parentType = baseType;
339+
GroupType baseType = baseColumnType.get().asGroupType();
340+
Optional<List<org.apache.parquet.schema.Type>> subFieldTypesOptional = dereferenceSubFieldTypes(baseType, column.getHiveColumnProjectionInfo().get());
341341

342-
for (String name : column.getHiveColumnProjectionInfo().get().getDereferenceNames()) {
343-
org.apache.parquet.schema.Type childType = getParquetTypeByName(name, parentType.asGroupType());
344-
if (childType == null) {
345-
return Optional.empty();
346-
}
347-
typeBuilder.add(childType);
348-
parentType = childType;
342+
// if there is a mismatch between parquet schema and the hive schema and the column cannot be dereferenced
343+
if (subFieldTypesOptional.isEmpty()) {
344+
return Optional.empty();
349345
}
350-
351-
List<org.apache.parquet.schema.Type> subfieldTypes = typeBuilder.build();
352-
org.apache.parquet.schema.Type type = subfieldTypes.get(subfieldTypes.size() - 1);
353-
for (int i = subfieldTypes.size() - 2; i >= 0; --i) {
354-
GroupType groupType = subfieldTypes.get(i).asGroupType();
355-
type = new GroupType(groupType.getRepetition(), groupType.getName(), ImmutableList.of(type));
346+
else {
347+
List<org.apache.parquet.schema.Type> subfieldTypes = subFieldTypesOptional.get();
348+
org.apache.parquet.schema.Type type = subfieldTypes.get(subfieldTypes.size() - 1);
349+
for (int i = subfieldTypes.size() - 2; i >= 0; --i) {
350+
GroupType groupType = subfieldTypes.get(i).asGroupType();
351+
type = new GroupType(groupType.getRepetition(), groupType.getName(), ImmutableList.of(type));
352+
}
353+
return Optional.of(new GroupType(baseType.getRepetition(), baseType.getName(), ImmutableList.of(type)));
356354
}
357-
return Optional.of(new GroupType(baseType.getRepetition(), baseType.getName(), ImmutableList.of(type)));
358355
}
359356

360357
public static Optional<ColumnIndexStore> getColumnIndexStore(
@@ -509,4 +506,32 @@ private static Optional<org.apache.parquet.schema.Type> getBaseColumnParquetType
509506

510507
return Optional.empty();
511508
}
509+
510+
/**
511+
* Dereferencing base parquet type based on projection info's dereference names.
512+
* For example, when dereferencing baseType(level1Field0, level1Field1, Level1Field2(Level2Field0, Level2Field1))
513+
* with a projection info's dereferenceNames list as (basetype, Level1Field2, Level2Field1).
514+
* It would return a list of parquet types in the order of (level1Field2, Level2Field1)
515+
*
516+
* @return child fields on each level of dereferencing. Return Optional.empty when failed to do the lookup.
517+
*/
518+
private static Optional<List<org.apache.parquet.schema.Type>> dereferenceSubFieldTypes(GroupType baseType, HiveColumnProjectionInfo projectionInfo)
519+
{
520+
checkArgument(baseType != null, "base type cannot be null when dereferencing");
521+
checkArgument(projectionInfo != null, "hive column projection info cannot be null when doing dereferencing");
522+
523+
ImmutableList.Builder<org.apache.parquet.schema.Type> typeBuilder = ImmutableList.builder();
524+
org.apache.parquet.schema.Type parentType = baseType;
525+
526+
for (String name : projectionInfo.getDereferenceNames()) {
527+
org.apache.parquet.schema.Type childType = getParquetTypeByName(name, parentType.asGroupType());
528+
if (childType == null) {
529+
return Optional.empty();
530+
}
531+
typeBuilder.add(childType);
532+
parentType = childType;
533+
}
534+
535+
return Optional.of(typeBuilder.build());
536+
}
512537
}

0 commit comments

Comments
 (0)