From ff24dd8a1e14d769c68135bba6a3ca0c986162b7 Mon Sep 17 00:00:00 2001 From: okumin Date: Sat, 12 Apr 2025 19:03:28 +0900 Subject: [PATCH 1/2] HIVE-13748: TypeInfoParser cannot handle symbols in the field name of STRUCT --- .../clientpositive/create_struct_table.q | 5 ++ .../llap/create_struct_table.q.out | 49 ++++++++++++++ .../hive/serde2/typeinfo/TypeInfoUtils.java | 47 ++++++-------- .../serde2/typeinfo/TestTypeInfoUtils.java | 65 ++++++++++++++++++- 4 files changed, 136 insertions(+), 30 deletions(-) diff --git a/ql/src/test/queries/clientpositive/create_struct_table.q b/ql/src/test/queries/clientpositive/create_struct_table.q index fafe52c8150b..1e7b43dee877 100644 --- a/ql/src/test/queries/clientpositive/create_struct_table.q +++ b/ql/src/test/queries/clientpositive/create_struct_table.q @@ -33,3 +33,8 @@ load data local inpath '../../data/files/kv1.txt' overwrite into table varchar_fields; SELECT strct, strct.a, strct.b, strct.c FROM varchar_fields LIMIT 10; + +create table complex_field_names(strct struct<`user-_$+/^ |id`:int, `♠️♥️`:string>); +insert into complex_field_names values (named_struct('user-_$+/^ |id', 1, '♠️♥️', 'abcde')); +explain select * from complex_field_names; +select * from complex_field_names; diff --git a/ql/src/test/results/clientpositive/llap/create_struct_table.q.out b/ql/src/test/results/clientpositive/llap/create_struct_table.q.out index f2fd8930eeb5..0387eda49a0a 100644 --- a/ql/src/test/results/clientpositive/llap/create_struct_table.q.out +++ b/ql/src/test/results/clientpositive/llap/create_struct_table.q.out @@ -124,3 +124,52 @@ POSTHOOK: Input: default@varchar_fields {"a":278,"b":"val_2","c":null} 278 val_2 NULL {"a":98,"b":"val_9","c":null} 98 val_9 NULL {"a":484,"b":"val_4","c":null} 484 val_4 NULL +PREHOOK: query: create table complex_field_names(strct struct<`user-_$+/^ |id`:int, `♠️♥️`:string>) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@complex_field_names +POSTHOOK: query: create table complex_field_names(strct struct<`user-_$+/^ |id`:int, `♠️♥️`:string>) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@complex_field_names +PREHOOK: query: insert into complex_field_names values (named_struct('user-_$+/^ |id', 1, '♠️♥️', 'abcde')) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@complex_field_names +POSTHOOK: query: insert into complex_field_names values (named_struct('user-_$+/^ |id', 1, '♠️♥️', 'abcde')) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@complex_field_names +POSTHOOK: Lineage: complex_field_names.strct SCRIPT [] +PREHOOK: query: explain select * from complex_field_names +PREHOOK: type: QUERY +PREHOOK: Input: default@complex_field_names +#### A masked pattern was here #### +POSTHOOK: query: explain select * from complex_field_names +POSTHOOK: type: QUERY +POSTHOOK: Input: default@complex_field_names +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: complex_field_names + Select Operator + expressions: strct (type: struct) + outputColumnNames: _col0 + ListSink + +PREHOOK: query: select * from complex_field_names +PREHOOK: type: QUERY +PREHOOK: Input: default@complex_field_names +#### A masked pattern was here #### +POSTHOOK: query: select * from complex_field_names +POSTHOOK: type: QUERY +POSTHOOK: Input: default@complex_field_names +#### A masked pattern was here #### +{"user-_$+/^ |id":1,"♠️♥️":"abcde"} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java index 4517dbb200c1..44174f3604ff 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hive.serde2.typeinfo; +import com.google.common.collect.Lists; import java.lang.reflect.Field; import java.lang.reflect.GenericArrayType; import java.lang.reflect.Method; @@ -33,6 +34,7 @@ import java.util.concurrent.ConcurrentHashMap; import com.google.common.collect.ImmutableSet; +import java.util.stream.Collectors; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.common.type.HiveVarchar; import org.apache.hadoop.hive.serde.serdeConstants; @@ -51,16 +53,14 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveTypeEntry; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * TypeInfoUtils. * */ public final class TypeInfoUtils { - - protected static final Logger LOG = LoggerFactory.getLogger(TypeInfoUtils.class); + private static final Set NON_TYPE_CHARACTERS = Lists.newArrayList('<', '>', '(', ')', '\'', ',', ':', ';') + .stream().map(x -> (int) x).collect(Collectors.toSet()); public static List numericTypeList = new ArrayList(); // The ordering of types here is used to determine which numeric types @@ -234,15 +234,6 @@ public static List getParameterTypeInfos(Method m, int size) { return typeInfos; } - public static boolean hasParameters(String typeName) { - int idx = typeName.indexOf('('); - if (idx == -1) { - return false; - } else { - return true; - } - } - public static String getBaseName(String typeName) { int idx = typeName.indexOf('('); if (idx == -1) { @@ -291,8 +282,8 @@ public String toString() { } }; - private static boolean isTypeChar(char c) { - return Character.isLetterOrDigit(c) || c == '_' || c == '.' || c == ' ' || c == '$'; + private static boolean isTypeChar(int codePoint) { + return !NON_TYPE_CHARACTERS.contains(codePoint); } /** @@ -307,35 +298,37 @@ private static boolean isTypeChar(char c) { * in any type in Hive, it is safe to do so. */ private static ArrayList tokenize(String typeInfoString) { - ArrayList tokens = new ArrayList(0); + ArrayList tokens = new ArrayList<>(0); + + int[] codePoints = typeInfoString.codePoints().toArray(); int begin = 0; int end = 1; - while (end <= typeInfoString.length()) { + while (end <= codePoints.length) { // last character ends a token? // if there are quotes, all the text between the quotes // is considered a single token (this can happen for // timestamp with local time-zone) if (begin > 0 && - typeInfoString.charAt(begin - 1) == '(' && - typeInfoString.charAt(begin) == '\'') { + codePoints[begin - 1] == '(' && + codePoints[begin] == '\'') { // Ignore starting quote begin++; do { end++; - } while (typeInfoString.charAt(end) != '\''); - } else if (typeInfoString.charAt(begin) == '\'' && - typeInfoString.charAt(begin + 1) == ')') { + } while (codePoints[end] != '\''); + } else if (codePoints[begin] == '\'' && + codePoints[begin + 1] == ')') { // Ignore closing quote begin++; end++; } - if (end == typeInfoString.length() - || !isTypeChar(typeInfoString.charAt(end - 1)) - || !isTypeChar(typeInfoString.charAt(end))) { + if (end == codePoints.length + || !isTypeChar(codePoints[end - 1]) + || !isTypeChar(codePoints[end])) { Token t = new Token(); t.position = begin; - t.text = typeInfoString.substring(begin, end).trim(); - t.isType = isTypeChar(typeInfoString.charAt(begin)); + t.text = new String(codePoints, begin, end - begin).trim(); + t.isType = isTypeChar(codePoints[begin]); tokens.add(t); begin = end; } diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/typeinfo/TestTypeInfoUtils.java b/serde/src/test/org/apache/hadoop/hive/serde2/typeinfo/TestTypeInfoUtils.java index a29f5d1fe66b..c1b3b4219b10 100644 --- a/serde/src/test/org/apache/hadoop/hive/serde2/typeinfo/TestTypeInfoUtils.java +++ b/serde/src/test/org/apache/hadoop/hive/serde2/typeinfo/TestTypeInfoUtils.java @@ -20,9 +20,10 @@ -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import static org.junit.Assert.assertEquals; + +import java.util.List; +import org.apache.hadoop.util.Lists; import org.junit.Test; /** @@ -65,7 +66,15 @@ public void testTypeInfoParser() { "char(123,)", "char()", "char(", - "decimal()" + "decimal()", + "struct", + "structid:int>", + "struct", + "struct", + "struct", + "struct", + "struct", + "struct", }; for (String typeString : validTypeStrings) { @@ -125,4 +134,54 @@ public void testDecimal() { assertEquals("Failed for " + testCase.typeString, testCase.expectedScale, decimalType.getScale()); } } + + @Test + public void testStruct() { + String[] testCases = { + "struct", + "struct", + "struct<\uD842\uDFB7野家♥️:string>" // surrogate pairs + }; + StructTypeInfo[] expected = { + new StructTypeInfo( + Lists.newArrayList("user id", "user group"), + Lists.newArrayList(TypeInfoFactory.intTypeInfo, TypeInfoFactory.intTypeInfo) + ), + new StructTypeInfo( + Lists.newArrayList("user-_$+/^ |id", "user-_$+/^ |group"), + Lists.newArrayList(TypeInfoFactory.intTypeInfo, TypeInfoFactory.intTypeInfo) + ), + new StructTypeInfo( + Lists.newArrayList("\uD842\uDFB7野家♥️"), + Lists.newArrayList(TypeInfoFactory.stringTypeInfo) + ) + }; + assertEquals(expected.length, testCases.length); + + for (int i = 0; i < testCases.length; i++) { + String testCase = testCases[i]; + StructTypeInfo expectedType = expected[i]; + TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(testCase); + assertEquals(expectedType, typeInfo); + } + } + + @Test + public void testMultipleValues() { + String str = "int,struct:struct<\uD842\uDFB7野家♥️:string>;double"; + List types = TypeInfoUtils.getTypeInfosFromTypeString(str); + List expected = Lists.newArrayList( + TypeInfoFactory.intTypeInfo, + new StructTypeInfo( + Lists.newArrayList("user-_$+/^ |id", "user-_$+/^ |group"), + Lists.newArrayList(TypeInfoFactory.intTypeInfo, TypeInfoFactory.intTypeInfo) + ), + new StructTypeInfo( + Lists.newArrayList("\uD842\uDFB7野家♥️"), + Lists.newArrayList(TypeInfoFactory.stringTypeInfo) + ), + TypeInfoFactory.doubleTypeInfo + ); + assertEquals(expected, types); + } } From e549b26cb6fdc5350933a43460b8667c2a73debc Mon Sep 17 00:00:00 2001 From: okumin Date: Sat, 12 Apr 2025 19:32:49 +0900 Subject: [PATCH 2/2] Add some words to expect.txt --- .github/actions/spelling/expect.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index d7d233d3b4a6..b90dac5d90f1 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -97,6 +97,7 @@ deserialized deserializer deserializing dest +DFB dfr dfs dfw @@ -418,6 +419,7 @@ uoi uri urie url +user'id usoi utf Utils