diff --git a/hive/src/main/java/org/apache/iceberg/hive/legacy/HiveTypeUtil.java b/hive/src/main/java/org/apache/iceberg/hive/legacy/HiveTypeUtil.java
index a7b4848b4f..8d15d4d38e 100644
--- a/hive/src/main/java/org/apache/iceberg/hive/legacy/HiveTypeUtil.java
+++ b/hive/src/main/java/org/apache/iceberg/hive/legacy/HiveTypeUtil.java
@@ -26,11 +26,17 @@
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 
+import org.apache.iceberg.types.Type;
+
 public class HiveTypeUtil {
   private HiveTypeUtil() {
   }
 
+  public static Type convert(TypeInfo typeInfo) {
+    return HiveTypeUtil.visit(typeInfo, new HiveTypeToIcebergType());
+  }
+
   public static <T> T visit(TypeInfo typeInfo, HiveSchemaVisitor<T> visitor) {
     switch (typeInfo.getCategory()) {
       case STRUCT:
diff --git a/hive/src/main/java/org/apache/iceberg/hive/legacy/LegacyHiveTableUtils.java b/hive/src/main/java/org/apache/iceberg/hive/legacy/LegacyHiveTableUtils.java
index 382cb3a922..cbf02612d0 100644
--- a/hive/src/main/java/org/apache/iceberg/hive/legacy/LegacyHiveTableUtils.java
+++ b/hive/src/main/java/org/apache/iceberg/hive/legacy/LegacyHiveTableUtils.java
@@ -19,6 +19,7 @@
 
 package org.apache.iceberg.hive.legacy;
 
+import com.google.common.base.Preconditions;
 import com.google.common.collect.Lists;
 import java.util.HashMap;
 import java.util.List;
@@ -28,7 +29,9 @@
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.metastore.api.Partition;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
 import org.apache.iceberg.FileFormat;
 import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Schema;
@@ -51,14 +54,17 @@ private LegacyHiveTableUtils() {
   static Schema getSchema(org.apache.hadoop.hive.metastore.api.Table table) {
     Map<String, String> props = getTableProperties(table);
     String schemaStr = props.get("avro.schema.literal");
-    if (schemaStr == null) {
-      LOG.warn("Table {}.{} does not have an avro.schema.literal set; using Hive schema instead. The schema will not" +
-          " have case sensitivity and nullability information", table.getDbName(), table.getTableName());
-      // TODO: Add support for tables without avro.schema.literal
-      throw new UnsupportedOperationException("Reading tables without avro.schema.literal not implemented yet");
+    Schema schema;
+    if (schemaStr != null) {
+      schema = AvroSchemaUtil.toIceberg(new org.apache.avro.Schema.Parser().parse(schemaStr));
+    } else {
+      //TODO: Do we need to support column and column.types properties for ORC tables?
+      LOG.warn("Table {}.{} does not have an avro.schema.literal set; using Hive schema instead. " +
+          "The schema will not have case sensitivity and nullability information",
+          table.getDbName(), table.getTableName());
+      Type icebergType = HiveTypeUtil.convert(parseTypeInfo(table.getSd().getCols()));
+      schema = new Schema(icebergType.asNestedType().asStructType().fields());
     }
-
-    Schema schema = AvroSchemaUtil.toIceberg(new org.apache.avro.Schema.Parser().parse(schemaStr));
     Types.StructType dataStructType = schema.asStruct();
     List<Types.NestedField> fields = Lists.newArrayList(dataStructType.fields());
 
@@ -68,6 +74,19 @@ static Schema getSchema(org.apache.hadoop.hive.metastore.api.Table table) {
     return new Schema(fields);
   }
 
+  private static TypeInfo parseTypeInfo(List<FieldSchema> cols) {
+    Preconditions.checkArgument(cols != null && cols.size() > 0, "No Hive schema present");
+    List<String> fieldNames = cols
+        .stream()
+        .map(FieldSchema::getName)
+        .collect(Collectors.toList());
+    List<TypeInfo> fieldTypeInfos = cols
+        .stream()
+        .map(f -> TypeInfoUtils.getTypeInfoFromTypeString(f.getType()))
+        .collect(Collectors.toList());
+    return TypeInfoFactory.getStructTypeInfo(fieldNames, fieldTypeInfos);
+  }
+
   private static Schema partitionSchema(List<FieldSchema> partitionKeys, Schema dataSchema) {
     AtomicInteger fieldId = new AtomicInteger(10000);
     List<Types.NestedField> partitionFields = Lists.newArrayList();
@@ -78,14 +97,14 @@ private static Schema partitionSchema(List partitionKeys, Schema da
       }
       partitionFields.add(
           Types.NestedField.optional(
-              fieldId.incrementAndGet(), f.getName(), icebergType(f.getType()), f.getComment()));
+              fieldId.incrementAndGet(), f.getName(), primitiveIcebergType(f.getType()), f.getComment()));
     });
     return new Schema(partitionFields);
   }
 
-  private static Type icebergType(String hiveTypeString) {
+  private static Type primitiveIcebergType(String hiveTypeString) {
     PrimitiveTypeInfo primitiveTypeInfo = TypeInfoFactory.getPrimitiveTypeInfo(hiveTypeString);
-    return HiveTypeUtil.visit(primitiveTypeInfo, new HiveTypeToIcebergType());
+    return HiveTypeUtil.convert(primitiveTypeInfo);
   }
 
   static Map<String, String> getTableProperties(org.apache.hadoop.hive.metastore.api.Table table) {
diff --git a/hive/src/test/java/org/apache/iceberg/hive/legacy/TestHiveSchemaConversions.java b/hive/src/test/java/org/apache/iceberg/hive/legacy/TestHiveSchemaConversions.java
new file mode 100644
index 0000000000..1f481e3ffd
--- /dev/null
+++ b/hive/src/test/java/org/apache/iceberg/hive/legacy/TestHiveSchemaConversions.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.hive.legacy;
+
+import com.google.common.collect.Lists;
+import java.util.List;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.Types;
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class TestHiveSchemaConversions {
+  @Test
+  public void testPrimitiveTypes() {
+    List<Type> primitives = Lists.newArrayList(
+        Types.BooleanType.get(),
+        Types.IntegerType.get(),
+        Types.LongType.get(),
+        Types.FloatType.get(),
+        Types.DoubleType.get(),
+        Types.DateType.get(),
+        Types.TimestampType.withoutZone(),
+        Types.StringType.get(),
+        Types.BinaryType.get(),
+        Types.DecimalType.of(9, 4)
+    );
+
+    List<PrimitiveTypeInfo> hivePrimitives = Lists.newArrayList(
+        TypeInfoFactory.booleanTypeInfo,
+        TypeInfoFactory.intTypeInfo,
+        TypeInfoFactory.longTypeInfo,
+        TypeInfoFactory.floatTypeInfo,
+        TypeInfoFactory.doubleTypeInfo,
+        TypeInfoFactory.dateTypeInfo,
+        TypeInfoFactory.timestampTypeInfo,
+        TypeInfoFactory.stringTypeInfo,
+        TypeInfoFactory.binaryTypeInfo,
+        TypeInfoFactory.getDecimalTypeInfo(9, 4)
+    );
+
+    for (int i = 0; i < primitives.size(); i += 1) {
+      Type icebergType = primitives.get(i);
+      PrimitiveTypeInfo hiveType = hivePrimitives.get(i);
+      Assert.assertEquals("Hive schema to primitive: " + hiveType, icebergType, HiveTypeUtil.convert(hiveType));
+    }
+  }
+
+  @Test
+  public void testConversions() {
+    check("struct<1: a: optional int, 2: b: optional string>", "struct<a:int,b:string>");
+    check("struct<5: a: optional map>>", "struct>>");
+    check("struct<2: l1: optional list>", "struct>");
+    check("struct<3: l1: optional list>>", "struct>>");
+    check("list>>>>", "array>>>>");
+    check("struct<" +
+            "6: length: optional int, 7: count: optional int, " +
+            "8: list: optional list>, " +
+            "9: wordcounts: optional map>",
+        "struct<" +
+            "length:int,count:int,list:array>," +
+            "wordcounts:map>");
+  }
+
+  private static void check(String icebergTypeStr, String hiveTypeStr) {
+    Type icebergType = HiveTypeUtil.convert(TypeInfoUtils.getTypeInfoFromTypeString(hiveTypeStr));
+    Assert.assertEquals(icebergTypeStr, icebergType.toString());
+  }
+}
diff --git a/hive/src/test/java/org/apache/iceberg/hive/legacy/TestLegacyHiveTableScan.java b/hive/src/test/java/org/apache/iceberg/hive/legacy/TestLegacyHiveTableScan.java
index f0d9f1a7cb..d6a22bfb76 100644
--- a/hive/src/test/java/org/apache/iceberg/hive/legacy/TestLegacyHiveTableScan.java
+++ b/hive/src/test/java/org/apache/iceberg/hive/legacy/TestLegacyHiveTableScan.java
@@ -109,6 +109,15 @@ public void testHiveScanMultiPartition() throws Exception {
     filesMatch(ImmutableMap.of("pcol=ds/pIntCol=2/B", AVRO, "pcol=ds/pIntCol=1/A", AVRO), hiveScan(table));
   }
 
+  @Test
+  public void testHiveScanNoAvroSchema() throws Exception {
+    String tableName = "hive_scan_no_avro_schema";
+    Table table = createTable(tableName, DATA_COLUMNS, PARTITION_COLUMNS, ORC);
+    addPartition(table, ImmutableList.of("ds", 1), ORC, "A");
+    addPartition(table, ImmutableList.of("ds", 2), ORC, "B");
+    filesMatch(ImmutableMap.of("pcol=ds/pIntCol=2/B", ORC, "pcol=ds/pIntCol=1/A", ORC), hiveScan(table));
+  }
+
   @Test
   public void testHiveScanMultiPartitionWithFilter() throws Exception {
     String tableName = "multi_partition_with_filter";
@@ -152,6 +161,12 @@ public void testHiveScanHybridTable() throws Exception {
 
   private static Table createTable(String tableName, List<FieldSchema> columns, List<FieldSchema> partitionColumns)
       throws Exception {
+    return createTable(tableName, columns, partitionColumns, AVRO);
+  }
+
+  private static Table createTable(
+      String tableName, List<FieldSchema> columns, List<FieldSchema> partitionColumns, FileFormat format)
+      throws Exception {
     long currentTimeMillis = System.currentTimeMillis();
     Path tableLocation = dbPath.resolve(tableName);
     Files.createDirectories(tableLocation);
@@ -161,7 +176,7 @@ private static Table createTable(String tableName, List columns, Li
         (int) currentTimeMillis / 1000,
         (int) currentTimeMillis / 1000,
         Integer.MAX_VALUE,
-        storageDescriptor(columns, tableLocation.toString(), AVRO),
+        storageDescriptor(columns, tableLocation.toString(), format),
         partitionColumns,
         new HashMap<>(),
         null,
@@ -182,6 +197,8 @@ private static StorageDescriptor storageDescriptor(List columns, St
         storageDescriptor.setOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat");
         storageDescriptor.setInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat");
         serDeInfo.setSerializationLib("org.apache.hadoop.hive.serde2.avro.AvroSerDe");
+        storageDescriptor.setParameters(ImmutableMap.of(
+            AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName(), schemaLiteral(columns)));
         break;
       case ORC:
         storageDescriptor.setOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat");
@@ -192,8 +209,6 @@ private static StorageDescriptor storageDescriptor(List columns, St
         throw new UnsupportedOperationException("Unsupported file format: " + format);
     }
     storageDescriptor.setSerdeInfo(serDeInfo);
-    storageDescriptor.setParameters(ImmutableMap.of(
-        AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName(), schemaLiteral(columns)));
     return storageDescriptor;
   }
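
For reference, a minimal sketch (not part of the patch) of how the new fallback path derives an Iceberg schema from Hive column metadata when avro.schema.literal is absent. It chains the same calls the patch adds (TypeInfoUtils.getTypeInfoFromTypeString, TypeInfoFactory.getStructTypeInfo, HiveTypeUtil.convert); the class name and the column names/types are hypothetical, and the class is placed in org.apache.iceberg.hive.legacy only so HiveTypeUtil resolves.

package org.apache.iceberg.hive.legacy;

import com.google.common.collect.ImmutableList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Type;

public class HiveFallbackSchemaSketch {

  public static void main(String[] args) {
    // Hypothetical columns, standing in for table.getSd().getCols() on a table
    // that has no avro.schema.literal property.
    List<FieldSchema> cols = ImmutableList.of(
        new FieldSchema("id", "bigint", null),
        new FieldSchema("tags", "map<string,string>", null));

    // Hive column names and type strings -> a single struct TypeInfo.
    List<String> fieldNames = cols.stream().map(FieldSchema::getName).collect(Collectors.toList());
    List<TypeInfo> fieldTypeInfos = cols.stream()
        .map(f -> TypeInfoUtils.getTypeInfoFromTypeString(f.getType()))
        .collect(Collectors.toList());
    TypeInfo structTypeInfo = TypeInfoFactory.getStructTypeInfo(fieldNames, fieldTypeInfos);

    // The entry point added by the patch: TypeInfo -> Iceberg type -> Schema.
    Type icebergType = HiveTypeUtil.convert(structTypeInfo);
    Schema schema = new Schema(icebergType.asNestedType().asStructType().fields());
    System.out.println(schema);
  }
}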