From b007ce013b5516c0ad3647bc994cef0884a7e73e Mon Sep 17 00:00:00 2001 From: mchades Date: Tue, 28 Oct 2025 23:51:12 +0800 Subject: [PATCH 1/2] support Gravitino column type to Lance type --- .../lance/LanceCatalogOperations.java | 37 +-- .../lance/LanceDataTypeConverter.java | 246 ++++++++--------- .../lance/TestLanceDataTypeConverter.java | 261 ++++++++++++++++++ docs/generic-lakehouse-catalog.md | 142 ++++++++++ 4 files changed, 528 insertions(+), 158 deletions(-) create mode 100644 catalogs/catalog-generic-lakehouse/src/test/java/org/apache/gravitino/catalog/lakehouse/lance/TestLanceDataTypeConverter.java create mode 100644 docs/generic-lakehouse-catalog.md diff --git a/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceCatalogOperations.java b/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceCatalogOperations.java index dcfe6bd4896..9572c656d23 100644 --- a/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceCatalogOperations.java +++ b/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceCatalogOperations.java @@ -38,7 +38,6 @@ import java.util.Optional; import java.util.stream.Collectors; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.commons.lang3.ArrayUtils; import org.apache.gravitino.Catalog; @@ -129,7 +128,7 @@ public Table createTable( Dataset.create( new RootAllocator(), location, - convertColumnsToSchema(columns), + convertColumnsToArrowSchema(columns), new WriteParams.Builder().withStorageOptions(storageProps).build())) { GenericLakehouseTable.Builder builder = GenericLakehouseTable.builder(); return builder @@ -151,39 +150,13 @@ public Table createTable( } } - private org.apache.arrow.vector.types.pojo.Schema convertColumnsToSchema(Column[] columns) { - LanceDataTypeConverter converter = new LanceDataTypeConverter(); + private org.apache.arrow.vector.types.pojo.Schema convertColumnsToArrowSchema(Column[] columns) { List fields = Arrays.stream(columns) .map( - col -> { - boolean nullable = col.nullable(); - ArrowType parentType = converter.fromGravitino(col.dataType()); - List childTypes = converter.getChildTypes(col.dataType()); - List childFields = - childTypes.stream() - .map( - childType -> - new org.apache.arrow.vector.types.pojo.Field( - "", - org.apache.arrow.vector.types.pojo.FieldType.nullable( - childType), - null)) - .collect(Collectors.toList()); - - if (nullable) { - return new org.apache.arrow.vector.types.pojo.Field( - col.name(), - org.apache.arrow.vector.types.pojo.FieldType.nullable(parentType), - childFields); - } - - // not nullable - return new org.apache.arrow.vector.types.pojo.Field( - col.name(), - org.apache.arrow.vector.types.pojo.FieldType.notNullable(parentType), - childFields); - }) + col -> + LanceDataTypeConverter.CONVERTER.toArrowField( + col.name(), col.dataType(), col.nullable())) .collect(Collectors.toList()); return new org.apache.arrow.vector.types.pojo.Schema(fields); } diff --git a/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceDataTypeConverter.java b/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceDataTypeConverter.java index d7966edd5ee..ad07a586ba5 100644 --- a/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceDataTypeConverter.java +++ b/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceDataTypeConverter.java @@ -19,82 +19,159 @@ package org.apache.gravitino.catalog.lakehouse.lance; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.google.common.base.Preconditions; import com.google.common.collect.Lists; -import java.util.List; +import java.util.Arrays; +import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.FloatingPointPrecision; +import org.apache.arrow.vector.types.IntervalUnit; import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.Bool; import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.gravitino.connector.DataTypeConverter; import org.apache.gravitino.json.JsonUtils; import org.apache.gravitino.rel.types.Type; import org.apache.gravitino.rel.types.Types; import org.apache.gravitino.rel.types.Types.FixedType; -import org.apache.gravitino.rel.types.Types.UnparsedType; public class LanceDataTypeConverter implements DataTypeConverter { + public static final LanceDataTypeConverter CONVERTER = new LanceDataTypeConverter(); + + public Field toArrowField(String name, Type type, boolean nullable) { + switch (type.name()) { + case LIST: + Types.ListType listType = (Types.ListType) type; + FieldType listField = new FieldType(nullable, ArrowType.List.INSTANCE, null); + return new Field( + name, + listField, + Lists.newArrayList( + toArrowField("element", listType.elementType(), listType.elementNullable()))); + + case STRUCT: + Types.StructType structType = (Types.StructType) type; + FieldType structField = new FieldType(nullable, ArrowType.Struct.INSTANCE, null); + return new Field( + name, + structField, + Arrays.stream(structType.fields()) + .map(field -> toArrowField(field.name(), field.type(), field.nullable())) + .toList()); + + case MAP: + Types.MapType mapType = (Types.MapType) type; + FieldType mapField = new FieldType(nullable, new ArrowType.Map(false), null); + return new Field( + name, + mapField, + Lists.newArrayList( + toArrowField( + MapVector.DATA_VECTOR_NAME, + Types.StructType.of( + Types.StructType.Field.of( + // Note: Arrow MapVector requires key field to be non-nullable + MapVector.KEY_NAME, + mapType.keyType(), + false /*nullable*/, + null /*comment*/), + Types.StructType.Field.of( + MapVector.VALUE_NAME, + mapType.valueType(), + mapType.valueNullable(), + null)), + false /*nullable*/))); + case EXTERNAL: + Types.ExternalType externalType = (Types.ExternalType) type; + Field field; + try { + field = JsonUtils.anyFieldMapper().readValue(externalType.catalogString(), Field.class); + } catch (JsonProcessingException e) { + throw new RuntimeException( + "Failed to parse external type catalog string: " + externalType.catalogString(), e); + } + Preconditions.checkArgument( + name.equals(field.getName()), + "expected field name %s but got %s", + name, + field.getName()); + Preconditions.checkArgument( + nullable == field.isNullable(), + "expected field nullable %s but got %s", + nullable, + field.isNullable()); + return field; + + default: + // non-complex type + FieldType fieldType = new FieldType(nullable, fromGravitino(type), null); + return new Field(name, fieldType, null); + } + } + @Override public ArrowType fromGravitino(Type type) { switch (type.name()) { case BOOLEAN: return Bool.INSTANCE; case BYTE: - return new Int(8, true); + return new Int(8, ((Types.ByteType) type).signed()); case SHORT: - return new Int(16, true); + return new Int(8 * 2, ((Types.ShortType) type).signed()); case INTEGER: - return new Int(32, true); + return new Int(8 * 4, ((Types.IntegerType) type).signed()); case LONG: - return new Int(64, true); + return new Int(8 * 8, ((Types.LongType) type).signed()); case FLOAT: return new FloatingPoint(FloatingPointPrecision.SINGLE); case DOUBLE: return new FloatingPoint(FloatingPointPrecision.DOUBLE); + case STRING: + return ArrowType.Utf8.INSTANCE; + case BINARY: + return ArrowType.Binary.INSTANCE; case DECIMAL: - // Lance uses FIXED_SIZE_BINARY for decimal types - return new ArrowType.FixedSizeBinary(16); // assuming 16 bytes for decimal + Types.DecimalType decimalType = (Types.DecimalType) type; + return new ArrowType.Decimal(decimalType.precision(), decimalType.scale(), 8 * 16); case DATE: return new ArrowType.Date(DateUnit.DAY); - case TIME: - return new ArrowType.Time(TimeUnit.MILLISECOND, 32); case TIMESTAMP: - return new ArrowType.Timestamp(TimeUnit.MILLISECOND, null); - case VARCHAR: - case STRING: - return new ArrowType.Utf8(); + Types.TimestampType timestampType = (Types.TimestampType) type; + TimeUnit timeUnit = TimeUnit.MICROSECOND; + if (timestampType.hasPrecisionSet()) { + timeUnit = + switch (timestampType.precision()) { + case 0 -> TimeUnit.SECOND; + case 3 -> TimeUnit.MILLISECOND; + case 6 -> TimeUnit.MICROSECOND; + case 9 -> TimeUnit.NANOSECOND; + default -> throw new UnsupportedOperationException( + "Expected precision to be one of 0, 3, 6, 9 but got: " + + timestampType.precision()); + }; + } + if (timestampType.hasTimeZone()) { + // todo: need timeZoneId for timestamp with time zone + return new ArrowType.Timestamp(timeUnit, "UTC"); + } + return new ArrowType.Timestamp(timeUnit, null); + case TIME: + return new ArrowType.Time(TimeUnit.NANOSECOND, 8 * 8); + case NULL: + return ArrowType.Null.INSTANCE; + case INTERVAL_YEAR: + return new ArrowType.Interval(IntervalUnit.YEAR_MONTH); + case INTERVAL_DAY: + return new ArrowType.Duration(TimeUnit.MICROSECOND); case FIXED: FixedType fixedType = (FixedType) type; return new ArrowType.FixedSizeBinary(fixedType.length()); - case BINARY: - return new ArrowType.Binary(); - case UNPARSED: - String typeStr = ((UnparsedType) type).unparsedType().toString(); - try { - Type t = JsonUtils.anyFieldMapper().readValue(typeStr, Type.class); - if (t instanceof Types.ListType) { - return ArrowType.List.INSTANCE; - } else if (t instanceof Types.MapType) { - return new ArrowType.Map(false); - } else if (t instanceof Types.StructType) { - return ArrowType.Struct.INSTANCE; - } else { - throw new UnsupportedOperationException( - "Unsupported UnparsedType conversion: " + t.simpleString()); - } - } catch (Exception e) { - // FixedSizeListArray(integer, 3) - if (typeStr.startsWith("FixedSizeListArray")) { - int size = - Integer.parseInt( - typeStr.substring(typeStr.indexOf(',') + 1, typeStr.indexOf(')')).trim()); - return new ArrowType.FixedSizeList(size); - } - throw new UnsupportedOperationException("Failed to parse UnparsedType: " + typeStr, e); - } default: throw new UnsupportedOperationException("Unsupported Gravitino type: " + type.name()); } @@ -102,91 +179,8 @@ public ArrowType fromGravitino(Type type) { @Override public Type toGravitino(ArrowType arrowType) { - if (arrowType instanceof Bool) { - return Types.BooleanType.get(); - } else if (arrowType instanceof Int intType) { - switch (intType.getBitWidth()) { - case 8 -> { - return Types.ByteType.get(); - } - case 16 -> { - return Types.ShortType.get(); - } - case 32 -> { - return Types.IntegerType.get(); - } - case 64 -> { - return Types.LongType.get(); - } - default -> throw new UnsupportedOperationException( - "Unsupported Int bit width: " + intType.getBitWidth()); - } - } else if (arrowType instanceof FloatingPoint floatingPoint) { - switch (floatingPoint.getPrecision()) { - case SINGLE: - return Types.FloatType.get(); - case DOUBLE: - return Types.DoubleType.get(); - default: - throw new UnsupportedOperationException( - "Unsupported FloatingPoint precision: " + floatingPoint.getPrecision()); - } - } else if (arrowType instanceof ArrowType.FixedSizeBinary) { - ArrowType.FixedSizeBinary fixedSizeBinary = (ArrowType.FixedSizeBinary) arrowType; - return Types.FixedType.of(fixedSizeBinary.getByteWidth()); - } else if (arrowType instanceof ArrowType.Date) { - return Types.DateType.get(); - } else if (arrowType instanceof ArrowType.Time) { - return Types.TimeType.get(); - } else if (arrowType instanceof ArrowType.Timestamp) { - return Types.TimestampType.withoutTimeZone(); - } else if (arrowType instanceof ArrowType.Utf8) { - return Types.StringType.get(); - } else if (arrowType instanceof ArrowType.Binary) { - return Types.BinaryType.get(); - // TODO handle complex types like List, Map, Struct - } else { - throw new UnsupportedOperationException("Unsupported Arrow type: " + arrowType); - } - } - - public List getChildTypes(Type parentType) { - if (parentType.name() != Type.Name.UNPARSED) { - return List.of(); - } - - List arrowTypes = Lists.newArrayList(); - String typeStr = ((UnparsedType) parentType).unparsedType().toString(); - try { - Type t = JsonUtils.anyFieldMapper().readValue(typeStr, Type.class); - if (t instanceof Types.ListType listType) { - arrowTypes.add(fromGravitino(listType.elementType())); - } else if (t instanceof Types.MapType mapType) { - arrowTypes.add(fromGravitino(mapType.keyType())); - arrowTypes.add(fromGravitino(mapType.valueType())); - } else { - // TODO support struct type. - throw new UnsupportedOperationException( - "Unsupported UnparsedType conversion: " + t.simpleString()); - } - - return arrowTypes; - } catch (Exception e) { - // FixedSizeListArray(integer, 3) - - try { - if (typeStr.startsWith("FixedSizeListArray")) { - String type = typeStr.substring(typeStr.indexOf('(') + 1, typeStr.indexOf(',')).trim(); - Type childType = JsonUtils.anyFieldMapper().readValue("\"" + type + "\"", Type.class); - arrowTypes.add(fromGravitino(childType)); - - return arrowTypes; - } - } catch (Exception e1) { - throw new UnsupportedOperationException("Failed to parse UnparsedType: " + typeStr, e1); - } - - throw new UnsupportedOperationException("Failed to parse UnparsedType: " + typeStr, e); - } + // since the table metadata will load from Gravitino storage directly, we don't need to + // implement this method for now. + throw new UnsupportedOperationException("toGravitino is not implemented yet."); } } diff --git a/catalogs/catalog-generic-lakehouse/src/test/java/org/apache/gravitino/catalog/lakehouse/lance/TestLanceDataTypeConverter.java b/catalogs/catalog-generic-lakehouse/src/test/java/org/apache/gravitino/catalog/lakehouse/lance/TestLanceDataTypeConverter.java new file mode 100644 index 00000000000..f86499112d7 --- /dev/null +++ b/catalogs/catalog-generic-lakehouse/src/test/java/org/apache/gravitino/catalog/lakehouse/lance/TestLanceDataTypeConverter.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.gravitino.catalog.lakehouse.lance; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.function.Consumer; +import java.util.stream.Stream; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.gravitino.rel.types.Type; +import org.apache.gravitino.rel.types.Types; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.CsvSource; +import org.junit.jupiter.params.provider.MethodSource; + +public class TestLanceDataTypeConverter { + private static final LanceDataTypeConverter CONVERTER = LanceDataTypeConverter.CONVERTER; + + // Gravitino complex type definitions for testing + private static final Types.StructType SIMPLE_STRUCT = + Types.StructType.of( + Types.StructType.Field.of("id", Types.LongType.get(), false, null), + Types.StructType.Field.of("name", Types.StringType.get(), true, null)); + + private static final Types.StructType NESTED_STRUCT = + Types.StructType.of( + Types.StructType.Field.of("id", Types.LongType.get(), false, null), + Types.StructType.Field.of( + "address", + Types.StructType.of( + Types.StructType.Field.of("street", Types.StringType.get(), false, null), + Types.StructType.Field.of("city", Types.StringType.get(), false, null)), + true, + null)); + private static final String NESTED_STRUCT_JSON = + "{\"name\":\"person_nested_json\",\"nullable\":false,\"type\":{\"name\":\"struct\"},\"children\":[" + + "{\"name\":\"id\",\"nullable\":false,\"type\":{\"name\":\"int\",\"bitWidth\":64,\"isSigned\":true},\"children\":[]}," + + "{\"name\":\"address\",\"nullable\":true,\"type\":{\"name\":\"struct\"},\"children\":[" + + "{\"name\":\"street\",\"nullable\":false,\"type\":{\"name\":\"utf8\"},\"children\":[]}," + + "{\"name\":\"city\",\"nullable\":false,\"type\":{\"name\":\"utf8\"},\"children\":[]}" + + "]}" + + "]}"; + + private static final Types.ListType LIST_OF_STRUCTS = + Types.ListType.of( + Types.StructType.of( + Types.StructType.Field.of("sku", Types.StringType.get(), false, null), + Types.StructType.Field.of("quantity", Types.IntegerType.get(), false, null)), + true); + + // Field validators for Arrow conversion tests + private static Consumer INT_VALIDATOR = + field -> assertInstanceOf(ArrowType.Int.class, field.getFieldType().getType()); + private static Consumer STRING_VALIDATOR = + field -> assertInstanceOf(ArrowType.Utf8.class, field.getFieldType().getType()); + private static Consumer LARGE_UTF8_VALIDATOR = + field -> assertInstanceOf(ArrowType.LargeUtf8.class, field.getFieldType().getType()); + private static Consumer BOOLEAN_VALIDATOR = + field -> assertInstanceOf(ArrowType.Bool.class, field.getFieldType().getType()); + private static Consumer DECIMAL_VALIDATOR = + field -> { + assertInstanceOf(ArrowType.Decimal.class, field.getFieldType().getType()); + ArrowType.Decimal decimal = (ArrowType.Decimal) field.getFieldType().getType(); + + assertEquals(10, decimal.getPrecision()); + assertEquals(2, decimal.getScale()); + }; + private static Consumer LIST_VALIDATOR = + field -> { + assertInstanceOf(ArrowType.List.class, field.getFieldType().getType()); + assertEquals(1, field.getChildren().size()); + + Field elementField = field.getChildren().get(0); + assertEquals("element", elementField.getName()); + assertTrue(elementField.isNullable()); + assertInstanceOf(ArrowType.Int.class, elementField.getFieldType().getType()); + }; + private static Consumer MAP_VALIDATOR = + field -> { + assertInstanceOf(ArrowType.Map.class, field.getFieldType().getType()); + assertEquals(1, field.getChildren().size()); + + Field structField = field.getChildren().get(0); + assertEquals(MapVector.DATA_VECTOR_NAME, structField.getName()); + assertEquals(2, structField.getChildren().size()); + + Field keyField = structField.getChildren().get(0); + assertEquals(MapVector.KEY_NAME, keyField.getName()); + assertFalse(keyField.isNullable()); + assertInstanceOf(ArrowType.Utf8.class, keyField.getFieldType().getType()); + + Field valueField = structField.getChildren().get(1); + assertEquals(MapVector.VALUE_NAME, valueField.getName()); + assertTrue(valueField.isNullable()); + assertInstanceOf(ArrowType.Int.class, valueField.getFieldType().getType()); + }; + private static Consumer STRUCT_VALIDATOR = + field -> { + assertInstanceOf(ArrowType.Struct.class, field.getFieldType().getType()); + assertEquals(2, field.getChildren().size()); + + Field idField = field.getChildren().get(0); + assertEquals("id", idField.getName()); + assertFalse(idField.isNullable()); + assertInstanceOf(ArrowType.Int.class, idField.getFieldType().getType()); + + Field nameField = field.getChildren().get(1); + assertEquals("name", nameField.getName()); + assertTrue(nameField.isNullable()); + assertInstanceOf(ArrowType.Utf8.class, nameField.getFieldType().getType()); + }; + private static Consumer NESTED_STRUCT_VALIDATOR = + field -> { + assertInstanceOf(ArrowType.Struct.class, field.getFieldType().getType()); + assertEquals(2, field.getChildren().size()); + + Field addressField = field.getChildren().get(1); + assertEquals("address", addressField.getName()); + assertTrue(addressField.isNullable()); + + assertInstanceOf(ArrowType.Struct.class, addressField.getFieldType().getType()); + assertEquals(2, addressField.getChildren().size()); + }; + private static Consumer LIST_OF_STRUCTS_VALIDATOR = + field -> { + assertInstanceOf(ArrowType.List.class, field.getFieldType().getType()); + assertEquals(1, field.getChildren().size()); + + Field elementField = field.getChildren().get(0); + assertEquals("element", elementField.getName()); + assertTrue(elementField.isNullable()); + assertInstanceOf(ArrowType.Struct.class, elementField.getFieldType().getType()); + assertEquals(2, elementField.getChildren().size()); + }; + + @ParameterizedTest + @DisplayName("Test conversion of Integer types (Byte, Short, Integer, Long)") + @CsvSource({"BYTE, 8, true", "SHORT, 16, true", "INTEGER, 32, true", "LONG, 64, true"}) + public void testFromGravitinoIntegerTypes( + String typeName, int expectedBitWidth, boolean expectedSigned) { + Type type = + switch (typeName) { + case "BYTE" -> Types.ByteType.get(); + case "SHORT" -> Types.ShortType.get(); + case "INTEGER" -> Types.IntegerType.get(); + case "LONG" -> Types.LongType.get(); + default -> throw new IllegalArgumentException("Unknown type: " + typeName); + }; + + ArrowType arrowType = CONVERTER.fromGravitino(type); + assertInstanceOf(ArrowType.Int.class, arrowType); + + ArrowType.Int intType = (ArrowType.Int) arrowType; + assertEquals(expectedBitWidth, intType.getBitWidth()); + assertEquals(expectedSigned, intType.getIsSigned()); + } + + @Test + public void testFromGravitinoTimestampWithTz() { + Types.TimestampType timestampType = Types.TimestampType.withTimeZone(); + ArrowType arrowType = CONVERTER.fromGravitino(timestampType); + assertInstanceOf(ArrowType.Timestamp.class, arrowType); + + ArrowType.Timestamp tsArrow = (ArrowType.Timestamp) arrowType; + assertEquals(TimeUnit.MICROSECOND, tsArrow.getUnit()); + assertEquals("UTC", tsArrow.getTimezone()); + } + + @ParameterizedTest(name = "[{index}] name={0}, type={1}, nullable={2}") + @MethodSource("toArrowFieldArguments") + @DisplayName("Test toArrowField for various types") + public void testToArrowField( + String name, Type gravitinoType, boolean nullable, Consumer validator) { + Field field = CONVERTER.toArrowField(name, gravitinoType, nullable); + + assertEquals(name, field.getName()); + assertEquals(nullable, field.isNullable()); + validator.accept(field); + } + + @Test + void testUnsupportedTypeThrowsException() { + Types.UnparsedType unparsedType = Types.UnparsedType.of("UNKNOWN_TYPE"); + assertThrows(UnsupportedOperationException.class, () -> CONVERTER.fromGravitino(unparsedType)); + } + + @Test + void testToGravitinoNotImplemented() { + assertThrows( + UnsupportedOperationException.class, () -> CONVERTER.toGravitino(ArrowType.Utf8.INSTANCE)); + } + + private static Stream toArrowFieldArguments() { + return Stream.of( + // Simple types + Arguments.of("age", Types.IntegerType.get(), true, INT_VALIDATOR), + Arguments.of("id", Types.LongType.get(), false, INT_VALIDATOR), + Arguments.of("name", Types.StringType.get(), true, STRING_VALIDATOR), + Arguments.of( + "description", + Types.ExternalType.of( + "{\n" + + " \"name\": \"description\",\n" + + " \"nullable\": true,\n" + + " \"type\": {\n" + + " \"name\": \"largeutf8\"\n" + + " }\n" + + "}"), + true, + LARGE_UTF8_VALIDATOR), + Arguments.of("active", Types.BooleanType.get(), false, BOOLEAN_VALIDATOR), + // Decimal + Arguments.of("price", Types.DecimalType.of(10, 2), false, DECIMAL_VALIDATOR), + // List + Arguments.of( + "numbers", Types.ListType.of(Types.IntegerType.get(), true), false, LIST_VALIDATOR), + // Map + Arguments.of( + "properties", + Types.MapType.of(Types.StringType.get(), Types.IntegerType.get(), true), + true, + MAP_VALIDATOR), + // Struct + Arguments.of("person", SIMPLE_STRUCT, true, STRUCT_VALIDATOR), + // Nested Struct + Arguments.of("person_nested", NESTED_STRUCT, false, NESTED_STRUCT_VALIDATOR), + Arguments.of( + "person_nested_json", + Types.ExternalType.of(NESTED_STRUCT_JSON), + false, + NESTED_STRUCT_VALIDATOR), + // List of Structs + Arguments.of("items", LIST_OF_STRUCTS, false, LIST_OF_STRUCTS_VALIDATOR)); + } +} diff --git a/docs/generic-lakehouse-catalog.md b/docs/generic-lakehouse-catalog.md new file mode 100644 index 00000000000..0c457e0f1b5 --- /dev/null +++ b/docs/generic-lakehouse-catalog.md @@ -0,0 +1,142 @@ +--- +title: "Lakehouse catalog" +slug: /lakehouse-catalog +keywords: + - lakehouse + - lance + - metadata +license: "This software is licensed under the Apache License version 2." +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Introduction + +TBD. + +### Requirements and limitations + +TBD. + +## Catalog + +### Catalog capabilities + +TBD. + +### Catalog properties + +TBD. + +### Catalog operations + +TBD. + +## Schema + +### Schema capabilities + +TBD. + +### Schema properties + +TBD. + +### Schema operations + +Please refer to [Manage Relational Metadata Using Gravitino](./manage-relational-metadata-using-gravitino.md#schema-operations) for more details. + +## Table + +### Table capabilities + +TBD. + +### Table partitions + +TBD. + +### Table sort orders + +TBD. + +### Table distributions + +TBD. + +### Table column types + +Since Lance uses Apache Arrow as the table schema, the following table shows the mapping between Gravitino types and Arrow types: + +| Gravitino Type | Arrow Type | +|----------------------------------|-----------------------------------------| +| `Struct` | `Struct` | +| `Map` | `Map` | +| `List` | `Array` | +| `Boolean` | `Boolean` | +| `Byte` | `Int8` | +| `Short` | `Int16` | +| `Integer` | `Int32` | +| `Long` | `Int64` | +| `Float` | `Float` | +| `Double` | `Double` | +| `String` | `Utf8` | +| `Binary` | `Binary` | +| `Decimal(p, s)` | `Decimal(p, s)` (128-bit) | +| `Date` | `Date` | +| `Timestamp`/`Timestamp(6)` | `TimestampType withoutZone` | +| `Timestamp(0)` | `TimestampType Second withoutZone` | +| `Timestamp(3)` | `TimestampType Millisecond withoutZone` | +| `Timestamp(9)` | `TimestampType Nanosecond withoutZone` | +| `Timestamp_tz`/`Timestamp_tz(6)` | `TimestampType Microsecond withUtc` | +| `Timestamp_tz(0)` | `TimestampType Second withUtc` | +| `Timestamp_tz(3)` | `TimestampType Millisecond withUtc` | +| `Timestamp_tz(9)` | `TimestampType Nanosecond withUtc` | +| `Time`/`Time(9)` | `Time Nanosecond` | +| `Null` | `Null` | +| `Fixed(n)` | `Fixed-Size Binary(n)` | +| `Interval_year` | `Interval(YearMonth)` | +| `Interval_day` | `Duration(Microsecond)` | +| `External(arrow_field_json_str)` | Any Arrow Field (see note below) | + +`External(arrow_field_json_str)`: + +As the table above shows, Gravitino provides mappings for most common data types. However, +in some cases, you may need to use an Arrow data type that is not directly supported by Gravitino. + +To address this, Gravitino introduces the `External(arrow_field_json_str)` type, +which allows you to define any Arrow data type by providing the JSON string of an Arrow `Field`. + +The JSON string must conform to the Apache Arrow `Field` [specification](https://github.com/apache/arrow-java/blob/ed81e5981a2bee40584b3a411ed755cb4cc5b91f/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java#L80C1-L86C68), +including details such as the field name, data type, and nullability. For example, you can define a `LargeUtf8` type field using its JSON representation. +```json +{ + "name": "col_name", + "nullable": true, + "type": { + "name": "largeutf8" + }, + "children": [] +} +``` + +**Important considerations:** +- The `name` attribute and `nullable` attribute in the JSON string must exactly match the corresponding column name and nullable in the Gravitino table. +- The `children` array should be empty for primitive types. For complex types like `Struct` or `List`, it must contain the definitions of the child fields. + +### Table properties + +TBD. + +### Table indexes + +TBD. + +### Table operations + +Please refer to [Manage Relational Metadata Using Gravitino](./manage-relational-metadata-using-gravitino.md#table-operations) for more details. + +## Object store configuration + +TBD. From a1a76dac2367573aca031f8fac723627e57f1626 Mon Sep 17 00:00:00 2001 From: mchades Date: Wed, 29 Oct 2025 17:38:19 +0800 Subject: [PATCH 2/2] update external types and add union type supports --- .../lance/LanceDataTypeConverter.java | 24 +++++++ .../lance/TestLanceDataTypeConverter.java | 68 ++++++++++++++++++- docs/generic-lakehouse-catalog.md | 20 +++--- 3 files changed, 100 insertions(+), 12 deletions(-) diff --git a/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceDataTypeConverter.java b/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceDataTypeConverter.java index ad07a586ba5..9cd5783de1b 100644 --- a/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceDataTypeConverter.java +++ b/catalogs/catalog-generic-lakehouse/src/main/java/org/apache/gravitino/catalog/lakehouse/lance/LanceDataTypeConverter.java @@ -23,11 +23,13 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import java.util.Arrays; +import java.util.List; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.types.DateUnit; import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.IntervalUnit; import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.UnionMode; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.Bool; import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; @@ -87,6 +89,28 @@ public Field toArrowField(String name, Type type, boolean nullable) { mapType.valueNullable(), null)), false /*nullable*/))); + + case UNION: + Types.UnionType unionType = (Types.UnionType) type; + List types = + Arrays.stream(unionType.types()) + .map( + t -> + toArrowField( + t.simpleString(), t, true /*nullable*/) // union members are nullable + ) + .toList(); + int[] typeIds = + types.stream() + .mapToInt( + f -> + org.apache.arrow.vector.types.Types.getMinorTypeForArrowType(f.getType()) + .ordinal()) + .toArray(); + FieldType unionField = + new FieldType(nullable, new ArrowType.Union(UnionMode.Sparse, typeIds), null); + return new Field(name, unionField, types); + case EXTERNAL: Types.ExternalType externalType = (Types.ExternalType) type; Field field; diff --git a/catalogs/catalog-generic-lakehouse/src/test/java/org/apache/gravitino/catalog/lakehouse/lance/TestLanceDataTypeConverter.java b/catalogs/catalog-generic-lakehouse/src/test/java/org/apache/gravitino/catalog/lakehouse/lance/TestLanceDataTypeConverter.java index f86499112d7..cf28ee74342 100644 --- a/catalogs/catalog-generic-lakehouse/src/test/java/org/apache/gravitino/catalog/lakehouse/lance/TestLanceDataTypeConverter.java +++ b/catalogs/catalog-generic-lakehouse/src/test/java/org/apache/gravitino/catalog/lakehouse/lance/TestLanceDataTypeConverter.java @@ -28,6 +28,7 @@ import java.util.stream.Stream; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.UnionMode; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.gravitino.rel.types.Type; @@ -158,6 +159,15 @@ public class TestLanceDataTypeConverter { assertInstanceOf(ArrowType.Struct.class, elementField.getFieldType().getType()); assertEquals(2, elementField.getChildren().size()); }; + private static Consumer UNION_VALIDATOR = + field -> { + assertInstanceOf(ArrowType.Union.class, field.getFieldType().getType()); + ArrowType.Union unionType = (ArrowType.Union) field.getFieldType().getType(); + assertEquals(UnionMode.Sparse, unionType.getMode()); + assertEquals(2, field.getChildren().size()); + assertInstanceOf(ArrowType.Int.class, field.getChildren().get(0).getFieldType().getType()); + assertInstanceOf(ArrowType.Utf8.class, field.getChildren().get(1).getFieldType().getType()); + }; @ParameterizedTest @DisplayName("Test conversion of Integer types (Byte, Short, Integer, Long)") @@ -192,6 +202,56 @@ public void testFromGravitinoTimestampWithTz() { assertEquals("UTC", tsArrow.getTimezone()); } + @Test + public void testExternalTypeConversion() { + String expectedColumnName = "col_name"; + boolean expectedNullable = true; + Types.ExternalType externalType = + Types.ExternalType.of( + "{\"name\":\"col_name\",\"nullable\":true," + + "\"type\":{\"name\":\"largeutf8\"},\"children\":[]}"); + Field arrowField = CONVERTER.toArrowField(expectedColumnName, externalType, expectedNullable); + assertEquals(expectedColumnName, arrowField.getName()); + assertEquals(expectedNullable, arrowField.isNullable()); + assertInstanceOf(ArrowType.LargeUtf8.class, arrowField.getFieldType().getType()); + + externalType = + Types.ExternalType.of( + "{\"name\":\"col_name\",\"nullable\":true," + + "\"type\":{\"name\":\"largebinary\"},\"children\":[]}"); + arrowField = CONVERTER.toArrowField(expectedColumnName, externalType, expectedNullable); + assertEquals(expectedColumnName, arrowField.getName()); + assertEquals(expectedNullable, arrowField.isNullable()); + assertInstanceOf(ArrowType.LargeBinary.class, arrowField.getFieldType().getType()); + + externalType = + Types.ExternalType.of( + "{\"name\":\"col_name\",\"nullable\":true," + + "\"type\":{\"name\":\"largelist\"}," + + "\"children\":[" + + "{\"name\":\"element\",\"nullable\":true," + + "\"type\":{\"name\":\"int\", \"bitWidth\":32, \"isSigned\": true}," + + "\"children\":[]}]}"); + arrowField = CONVERTER.toArrowField(expectedColumnName, externalType, expectedNullable); + assertEquals(expectedColumnName, arrowField.getName()); + assertEquals(expectedNullable, arrowField.isNullable()); + assertInstanceOf(ArrowType.LargeList.class, arrowField.getFieldType().getType()); + + externalType = + Types.ExternalType.of( + "{\"name\":\"col_name\",\"nullable\":true," + + "\"type\":{\"name\":\"fixedsizelist\", \"listSize\":10}," + + "\"children\":[" + + "{\"name\":\"element\",\"nullable\":true," + + "\"type\":{\"name\":\"int\", \"bitWidth\":32, \"isSigned\": true}," + + "\"children\":[]}]}"); + arrowField = CONVERTER.toArrowField(expectedColumnName, externalType, expectedNullable); + assertEquals(expectedColumnName, arrowField.getName()); + assertEquals(expectedNullable, arrowField.isNullable()); + assertInstanceOf(ArrowType.FixedSizeList.class, arrowField.getFieldType().getType()); + assertEquals(10, ((ArrowType.FixedSizeList) arrowField.getFieldType().getType()).getListSize()); + } + @ParameterizedTest(name = "[{index}] name={0}, type={1}, nullable={2}") @MethodSource("toArrowFieldArguments") @DisplayName("Test toArrowField for various types") @@ -256,6 +316,12 @@ private static Stream toArrowFieldArguments() { false, NESTED_STRUCT_VALIDATOR), // List of Structs - Arguments.of("items", LIST_OF_STRUCTS, false, LIST_OF_STRUCTS_VALIDATOR)); + Arguments.of("items", LIST_OF_STRUCTS, false, LIST_OF_STRUCTS_VALIDATOR), + // Union + Arguments.of( + "union_field", + Types.UnionType.of(Types.IntegerType.get(), Types.StringType.get()), + true, + UNION_VALIDATOR)); } } diff --git a/docs/generic-lakehouse-catalog.md b/docs/generic-lakehouse-catalog.md index 0c457e0f1b5..35eaeb46602 100644 --- a/docs/generic-lakehouse-catalog.md +++ b/docs/generic-lakehouse-catalog.md @@ -109,17 +109,15 @@ To address this, Gravitino introduces the `External(arrow_field_json_str)` type, which allows you to define any Arrow data type by providing the JSON string of an Arrow `Field`. The JSON string must conform to the Apache Arrow `Field` [specification](https://github.com/apache/arrow-java/blob/ed81e5981a2bee40584b3a411ed755cb4cc5b91f/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java#L80C1-L86C68), -including details such as the field name, data type, and nullability. For example, you can define a `LargeUtf8` type field using its JSON representation. -```json -{ - "name": "col_name", - "nullable": true, - "type": { - "name": "largeutf8" - }, - "children": [] -} -``` +including details such as the field name, data type, and nullability. +Here are some examples of how to use `External` type for various Arrow types that are not natively supported by Gravitino: + +| Arrow Type | External type | +|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `Large Utf8` | `External("{\"name\":\"col_name\",\"nullable\":true,\"type\":{\"name\":\"largeutf8\"},\"children\":[]}")` | +| `Large Binary` | `External("{\"name\":\"col_name\",\"nullable\":true,\"type\":{\"name\":\"largebinary\"},\"children\":[]}")` | +| `Large List` | `External("{\"name\":\"col_name\",\"nullable\":true,\"type\":{\"name\":\"largelist\"},\"children\":[{\"name\":\"element\",\"nullable\":true,\"type\":{\"name\":\"int\", \"bitWidth\":32, \"isSigned\": true},\"children\":[]}]}")` | +| `Fixed-Size List` | `External("{\"name\":\"col_name\",\"nullable\":true,\"type\":{\"name\":\"fixedsizelist\", \"listSize\":10},\"children\":[{\"name\":\"element\",\"nullable\":true,\"type\":{\"name\":\"int\", \"bitWidth\":32, \"isSigned\": true},\"children\":[]}]}")` | **Important considerations:** - The `name` attribute and `nullable` attribute in the JSON string must exactly match the corresponding column name and nullable in the Gravitino table.