Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 +50,6 @@ private FunctionReturnTypes() {
return typeFactory.createArrayType(typeFactory.createMapType(strType, strType), -1);
};

public static final SqlReturnTypeInference EXTRACT_UNION_FUNCTION_RETURN_STRATEGY = opBinding -> {
int numArgs = opBinding.getOperandCount();
Preconditions.checkState(numArgs == 1 || numArgs == 2);
// 1-arg case
if (numArgs == 1) {
return opBinding.getOperandType(0);
}
// 2-arg case
else {
int ordinal = opBinding.getOperandLiteralValue(1, Integer.class);
return opBinding.getOperandType(0).getFieldList().get(ordinal).getType();
}
};

public static final SqlReturnTypeInference ARRAY_OF_ARG0_TYPE =
opBinding -> opBinding.getTypeFactory().createArrayType(opBinding.getOperandType(0), -1);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
/**
* Copyright 2018-2021 LinkedIn Corporation. All rights reserved.
* Licensed under the BSD-2 Clause license.
* See LICENSE in the project root for license information.
*/
package com.linkedin.coral.hive.hive2rel.functions;

import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeFactory;
import org.apache.calcite.rel.type.RelDataTypeField;
import org.apache.calcite.sql.type.SqlReturnTypeInference;


/**
 * A utility class to coalesce the {@link RelDataType} of a struct between Trino's representation
 * and Hive's extract_union UDF's representation of an exploded union.
 */
public class CoalesceStructUtility {

  /** Field-name prefix Trino uses for the members of an exploded union: "field0", "field1", ... */
  private static final String TRINO_PREFIX = "field";

  /** Field-name prefix Hive's extract_union uses for the members of an exploded union: "tag_0", "tag_1", ... */
  private static final String HIVE_EXTRACT_UNION_PREFIX = "tag_";

  /** Name of the discriminator field leading Trino's exploded-union struct. */
  private static final String TRINO_TAG_FIELD_NAME = "tag";

  /**
   * The semantics for the extract_union is now pass-through: Assuming the engine's reader could deal with
   * union type and explode it into a struct, this extract_union UDF's return type will simply follow exploded struct's
   * schema based on how many arguments passed by users.
   */
  public static final SqlReturnTypeInference EXTRACT_UNION_FUNCTION_RETURN_STRATEGY = opBinding -> {
    int numArgs = opBinding.getOperandCount();
    Preconditions.checkState(numArgs == 1 || numArgs == 2);
    // 1-arg case: return the exploded struct unchanged
    if (numArgs == 1) {
      return opBinding.getOperandType(0);
    }
    // 2-arg case: the second argument is a literal ordinal selecting one member field of the struct
    int ordinal = opBinding.getOperandLiteralValue(1, Integer.class);
    return opBinding.getOperandType(0).getFieldList().get(ordinal).getType();
  };

  /**
   * Represents the return type for the coalesce_struct UDF that is built for bridging the schema difference
   * between extract_union UDF's processed schema of union field in Coral IR (let's call it struct_ex) and
   * Trino's schema when deserializing union field from its reader.
   * (Let's call it struct_tr, See https://github.com/trinodb/trino/pull/3483 for details).
   *
   * The main reason we need this bridging capability is that we have existing users relying on the
   * schema of struct_ex. While the underlying reader(e.g. the trino one referenced above) starts to interpret the union
   * in its own format, Coral tries to maintain backward compatibility on top of that. Notably we also have
   * an Iceberg reader doing the same, see LinkedIn's (temporary) fork on Iceberg:
   * https://github.com/linkedin/iceberg/pull/84 (Avro)
   * https://github.com/linkedin/iceberg/pull/85 (ORC)
   *
   *
   * Further details:
   * struct_tr looks like:
   * struct&lt;tag:int, field0:type0, field1:type1, ... fieldN:typeN&gt;
   *
   * struct_ex looks like:
   * struct&lt;tag_0:type0, tag_1:type1, ... tag_N:typeN&gt;
   *
   * This new UDF could be stated as the following signatures:
   * def coalesce_struct(struct:struct_tr) : struct_ex = {...}
   * def coalesce_struct(struct:struct_tr, ordinal: int): field_at_ordinal = {...}
   *
   */
  public static final SqlReturnTypeInference COALESCE_STRUCT_FUNCTION_RETURN_STRATEGY = opBinding -> {
    int numArgs = opBinding.getOperandCount();
    RelDataTypeFactory typeFactory = opBinding.getTypeFactory();
    Preconditions.checkState(numArgs == 1 || numArgs == 2);
    RelDataType coalescedDataType = coalesce(opBinding.getOperandType(0), typeFactory);
    // 1-arg case: return the whole coalesced struct
    if (numArgs == 1) {
      return coalescedDataType;
    }
    // 2-arg case: the second argument is a literal ordinal selecting one member field of the coalesced struct
    int ordinal = opBinding.getOperandLiteralValue(1, Integer.class);
    return coalescedDataType.getFieldList().get(ordinal).getType();
  };

  private CoalesceStructUtility() {
    // Utility class, does nothing in constructor
  }

  /**
   * Converts a {@link RelDataType} that could potentially contain a Trino-format exploded-union (i.e. a struct
   * in a format of {tag, field0, field1, ..., fieldN} to represent a union after being deserialized)
   * into an exploded-union that complies with Hive's extract_union UDF format
   * (i.e. a struct as {tag_0, tag_1, ..., tag_{N}} to represent a union after being deserialized).
   *
   * The conversion is applied recursively through structs, maps and collections; any other type is
   * returned unchanged.
   *
   * For more information, check: https://github.com/trinodb/trino/pull/3483
   *
   * @param inputNode the type to convert; may be any {@link RelDataType}
   * @param typeFactory factory used to build the converted types
   * @return the converted type, or {@code inputNode} itself when no conversion applies
   */
  @VisibleForTesting
  static RelDataType coalesce(RelDataType inputNode, RelDataTypeFactory typeFactory) {
    // Using type information implicitly carried in the object of RelDataType
    // instead of getting down to SqlTypeName since the former contains enough categorization
    // of types to achieve the purpose for this method.

    if (inputNode.isStruct()) {
      List<String> fieldNames = inputNode.getFieldNames();
      return coalesceStruct(inputNode, isTrinoStructPattern(fieldNames), typeFactory);
    } else if (inputNode.getKeyType() != null) {
      // A non-null key type identifies a map type.
      return coalesceMap(inputNode, typeFactory);
    } else if (inputNode.getComponentType() != null) {
      // A non-null component type identifies a collection (array) type.
      return coalesceCollection(inputNode, typeFactory);
    } else {
      return inputNode;
    }
  }

  /**
   * Rebuilds a struct type, recursively coalescing each member type. When {@code coalesceRequired}
   * is set (i.e. the struct matches Trino's exploded-union pattern), the leading "tag" field is
   * dropped and each remaining "field{N}" member is renamed to "tag_{N}".
   */
  private static RelDataType coalesceStruct(RelDataType inputNode, boolean coalesceRequired,
      RelDataTypeFactory typeFactory) {
    List<String> originalNames = inputNode.getFieldNames();
    List<String> convertedNames;
    if (coalesceRequired) {
      // Skip the "tag" discriminator and swap the "field" prefix for "tag_".
      // Using an explicit substring rather than replaceFirst avoids accidental regex semantics.
      convertedNames = originalNames.stream().skip(1)
          .map(name -> HIVE_EXTRACT_UNION_PREFIX + name.substring(TRINO_PREFIX.length()))
          .collect(Collectors.toList());
    } else {
      convertedNames = originalNames;
    }
    List<RelDataType> originalTypes =
        inputNode.getFieldList().stream().map(RelDataTypeField::getType).collect(Collectors.toList());
    List<RelDataType> convertedTypes = new ArrayList<>(convertedNames.size());
    // When coalescing, the type of the dropped "tag" field (index 0) is skipped as well.
    for (int i = coalesceRequired ? 1 : 0; i < originalTypes.size(); i++) {
      convertedTypes.add(coalesce(originalTypes.get(i), typeFactory));
    }

    return typeFactory.createStructType(convertedTypes, convertedNames);
  }

  /** Rebuilds a map type with both its key and value types recursively coalesced. */
  private static RelDataType coalesceMap(RelDataType inputNode, RelDataTypeFactory typeFactory) {
    RelDataType coalescedKeyType = coalesce(inputNode.getKeyType(), typeFactory);
    RelDataType coalescedValueType = coalesce(inputNode.getValueType(), typeFactory);
    return typeFactory.createMapType(coalescedKeyType, coalescedValueType);
  }

  /** Rebuilds an array type with its component type recursively coalesced; -1 means unbounded cardinality. */
  private static RelDataType coalesceCollection(RelDataType inputNode, RelDataTypeFactory typeFactory) {
    RelDataType coalescedComponentType = coalesce(inputNode.getComponentType(), typeFactory);
    return typeFactory.createArrayType(coalescedComponentType, -1);
  }

  /**
   * Trino's pattern has two elements:
   * - The first element has to be "tag".
   * - The following elements have to follow the naming pattern as "field{N}" where N
   * represents the position of this element in the struct, starting from 0.
   *
   * @param fieldNames field names of the struct under inspection, in declaration order
   * @return true iff the names match Trino's exploded-union pattern exactly
   */
  @VisibleForTesting
  static boolean isTrinoStructPattern(List<String> fieldNames) {
    if (fieldNames.isEmpty() || !TRINO_TAG_FIELD_NAME.equals(fieldNames.get(0))) {
      return false;
    }
    for (int i = 1; i < fieldNames.size(); i++) {
      // Member i must be named exactly "field{i-1}"; any deviation breaks the pattern.
      if (!fieldNames.get(i).equals(TRINO_PREFIX + (i - 1))) {
        return false;
      }
    }
    return true;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import com.linkedin.coral.common.functions.OperandTypeInference;
import com.linkedin.coral.common.functions.SameOperandTypeExceptFirstOperandChecker;

import static com.linkedin.coral.hive.hive2rel.functions.CoalesceStructUtility.*;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.*;
import static org.apache.calcite.sql.fun.SqlStdOperatorTable.*;
import static org.apache.calcite.sql.type.OperandTypes.*;
Expand Down Expand Up @@ -373,7 +374,9 @@ public boolean isOptional(int i) {

createAddUserDefinedFunction("array_contains", ReturnTypes.BOOLEAN, family(SqlTypeFamily.ARRAY, SqlTypeFamily.ANY));
createAddUserDefinedFunction("sort_array", ARG0, ARRAY);
createAddUserDefinedFunction("extract_union", FunctionReturnTypes.EXTRACT_UNION_FUNCTION_RETURN_STRATEGY,
createAddUserDefinedFunction("extract_union", EXTRACT_UNION_FUNCTION_RETURN_STRATEGY,
or(ANY, family(SqlTypeFamily.ANY, SqlTypeFamily.INTEGER)));
createAddUserDefinedFunction("coalesce_struct", COALESCE_STRUCT_FUNCTION_RETURN_STRATEGY,
or(ANY, family(SqlTypeFamily.ANY, SqlTypeFamily.INTEGER)));

// LinkedIn UDFs: Dali stores mapping from UDF name to the implementing Java class as table properties
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import com.linkedin.coral.common.HiveMscAdapter;
import com.linkedin.coral.common.HiveSchema;
import com.linkedin.coral.common.HiveTable;

Expand Down Expand Up @@ -79,7 +80,7 @@ public void testTable() {
}

@Test
public void testTableWithUnion() throws Exception {
public void testTableWithUnion() {
final RelDataTypeFactory typeFactory = new JavaTypeFactoryImpl();

// test handling of union
Expand All @@ -95,6 +96,36 @@ public void testTableWithUnion() throws Exception {
assertEquals(rowType.toString(), expectedTypeString);
}

@Test
public void testTableWithUnionComplex() throws Exception {
// Two complex scenarios for union:
// 1. nested union: a struct within a union that has another union as one of the member types.
// 2. when there's existing extract_union UDF (where we replace it with coalesce_struct when
// the reader returns a trino-compliant schema for a union field)
final RelDataTypeFactory typeFactory = new JavaTypeFactoryImpl();
// Schema: foo uniontype<int, double, struct<a:int, b:uniontype<int, double>>>
// it should become struct<tag:int, field0:int, field1:double, field2: struct<a:int,b:struct<tag:int, field0:int, field1:double>>>
Table nestedUnionTable = getTable("default", "nested_union");
RelDataType rowType = nestedUnionTable.getRowType(typeFactory);
assertNotNull(rowType);

String expectedTypeString =
"RecordType(" + "RecordType(" + "INTEGER tag, INTEGER field0, DOUBLE field1, RecordType("
+ "INTEGER a, RecordType(INTEGER tag, INTEGER field0, DOUBLE field1) b)" + " field2) foo)";
assertEquals(rowType.toString(), expectedTypeString);

// Case for with extract_union as part of view definition.
// Put the alias of foo as bar. The outcome type complies with extract_union's schema recursively

HiveMscAdapter mscAdapter = new HiveMscAdapter(hive.getMetastoreClient());
HiveToRelConverter converter = new HiveToRelConverter(mscAdapter);
RelDataType rowType2 = converter.convertSql("SELECT coalesce_struct(foo) AS bar from nested_union").getRowType();
assertNotNull(rowType2);
expectedTypeString = "RecordType(" + "RecordType(" + "INTEGER tag_0, DOUBLE tag_1, "
+ "RecordType(INTEGER a, RecordType(INTEGER tag_0, DOUBLE tag_1) b) tag_2) bar)";
assertEquals(rowType2.toString(), expectedTypeString);
}

@Test
public void testGetDaliFunctionParams() throws HiveException, TException {
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,18 +191,24 @@ public static TestHive setupDefaultHive(HiveConf conf) throws IOException {
driver.run(
"CREATE TABLE IF NOT EXISTS union_table(foo uniontype<int, double, array<string>, struct<a:int,b:string>>)");

testHive.databases =
ImmutableList.of(new TestHive.DB("test", ImmutableList.of("tableOne", "tableTwo", "tableOneView")),
new TestHive.DB("default",
ImmutableList.of("bar", "complex", "foo", "foo_view", "null_check_view", "null_check_wrapper",
"schema_evolve", "view_schema_evolve", "view_schema_evolve_wrapper", "union_table")),
new TestHive.DB("fuzzy_union",
ImmutableList.of("tableA", "tableB", "tableC", "union_view", "union_view_with_more_than_two_tables",
"union_view_with_alias", "union_view_single_branch_evolved",
"union_view_double_branch_evolved_different", "union_view_map_with_struct_value_evolved",
"union_view_array_with_struct_value_evolved", "union_view_deeply_nested_struct_evolved",
"union_view_more_than_two_branches_evolved",
"union_view_same_schema_evolution_with_different_ordering")));
// Nested union case.
// We don't put a union directly under a union since sources like https://avro.apache.org/docs/current/spec.html#Unions
// explicitly put that union cannot be directly nested under a union.
driver.run(
"CREATE TABLE IF NOT EXISTS nested_union(foo uniontype<int, double, struct<a:int, b:uniontype<int, double>>>)");

testHive.databases = ImmutableList.of(
new TestHive.DB("test", ImmutableList.of("tableOne", "tableTwo", "tableOneView")),
new TestHive.DB("default",
ImmutableList.of("bar", "complex", "foo", "foo_view", "null_check_view", "null_check_wrapper",
"schema_evolve", "view_schema_evolve", "view_schema_evolve_wrapper", "union_table", "nested_union")),
new TestHive.DB("fuzzy_union",
ImmutableList.of("tableA", "tableB", "tableC", "union_view", "union_view_with_more_than_two_tables",
"union_view_with_alias", "union_view_single_branch_evolved",
"union_view_double_branch_evolved_different", "union_view_map_with_struct_value_evolved",
"union_view_array_with_struct_value_evolved", "union_view_deeply_nested_struct_evolved",
"union_view_more_than_two_branches_evolved",
"union_view_same_schema_evolution_with_different_ordering")));

// add some Dali functions to table properties
IMetaStoreClient msc = testHive.getMetastoreClient();
Expand Down
Loading