Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/actions/spelling/expect.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ deserialized
deserializer
deserializing
dest
DFB
dfr
dfs
dfw
Expand Down Expand Up @@ -418,6 +419,7 @@ uoi
uri
urie
url
user'id
usoi
utf
Utils
Expand Down
5 changes: 5 additions & 0 deletions ql/src/test/queries/clientpositive/create_struct_table.q
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,8 @@ load data local inpath '../../data/files/kv1.txt'
overwrite into table varchar_fields;

SELECT strct, strct.a, strct.b, strct.c FROM varchar_fields LIMIT 10;

create table complex_field_names(strct struct<`user-_$+/^ |id`:int, `♠️♥️`:string>);
insert into complex_field_names values (named_struct('user-_$+/^ |id', 1, '♠️♥️', 'abcde'));
explain select * from complex_field_names;
select * from complex_field_names;
49 changes: 49 additions & 0 deletions ql/src/test/results/clientpositive/llap/create_struct_table.q.out
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,52 @@ POSTHOOK: Input: default@varchar_fields
{"a":278,"b":"val_2","c":null} 278 val_2 NULL
{"a":98,"b":"val_9","c":null} 98 val_9 NULL
{"a":484,"b":"val_4","c":null} 484 val_4 NULL
PREHOOK: query: create table complex_field_names(strct struct<`user-_$+/^ |id`:int, `♠️♥️`:string>)
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@complex_field_names
POSTHOOK: query: create table complex_field_names(strct struct<`user-_$+/^ |id`:int, `♠️♥️`:string>)
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@complex_field_names
PREHOOK: query: insert into complex_field_names values (named_struct('user-_$+/^ |id', 1, '♠️♥️', 'abcde'))
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@complex_field_names
POSTHOOK: query: insert into complex_field_names values (named_struct('user-_$+/^ |id', 1, '♠️♥️', 'abcde'))
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@complex_field_names
POSTHOOK: Lineage: complex_field_names.strct SCRIPT []
PREHOOK: query: explain select * from complex_field_names
PREHOOK: type: QUERY
PREHOOK: Input: default@complex_field_names
#### A masked pattern was here ####
POSTHOOK: query: explain select * from complex_field_names
POSTHOOK: type: QUERY
POSTHOOK: Input: default@complex_field_names
#### A masked pattern was here ####
STAGE DEPENDENCIES:
Stage-0 is a root stage

STAGE PLANS:
Stage: Stage-0
Fetch Operator
limit: -1
Processor Tree:
TableScan
alias: complex_field_names
Select Operator
expressions: strct (type: struct<user-_$+/^ |id:int,♠️♥️:string>)
outputColumnNames: _col0
ListSink

PREHOOK: query: select * from complex_field_names
PREHOOK: type: QUERY
PREHOOK: Input: default@complex_field_names
#### A masked pattern was here ####
POSTHOOK: query: select * from complex_field_names
POSTHOOK: type: QUERY
POSTHOOK: Input: default@complex_field_names
#### A masked pattern was here ####
{"user-_$+/^ |id":1,"♠️♥️":"abcde"}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hive.serde2.typeinfo;

import com.google.common.collect.Lists;
import java.lang.reflect.Field;
import java.lang.reflect.GenericArrayType;
import java.lang.reflect.Method;
Expand All @@ -33,6 +34,7 @@
import java.util.concurrent.ConcurrentHashMap;

import com.google.common.collect.ImmutableSet;
import java.util.stream.Collectors;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.serde.serdeConstants;
Expand All @@ -51,16 +53,14 @@
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveTypeEntry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* TypeInfoUtils.
*
*/
public final class TypeInfoUtils {

protected static final Logger LOG = LoggerFactory.getLogger(TypeInfoUtils.class);
private static final Set<Integer> NON_TYPE_CHARACTERS = Lists.newArrayList('<', '>', '(', ')', '\'', ',', ':', ';')
.stream().map(x -> (int) x).collect(Collectors.toSet());

public static List<PrimitiveCategory> numericTypeList = new ArrayList<PrimitiveCategory>();
// The ordering of types here is used to determine which numeric types
Expand Down Expand Up @@ -234,15 +234,6 @@ public static List<TypeInfo> getParameterTypeInfos(Method m, int size) {
return typeInfos;
}

/**
 * Returns whether the given type name carries a parameter list,
 * e.g. {@code "char(10)"} or {@code "decimal(10,2)"}.
 *
 * @param typeName the type name to inspect; must not be null
 * @return true if {@code typeName} contains a '(' introducing parameters
 */
public static boolean hasParameters(String typeName) {
  // indexOf returns -1 when '(' is absent; any other value means parameters.
  return typeName.indexOf('(') != -1;
}

public static String getBaseName(String typeName) {
int idx = typeName.indexOf('(');
if (idx == -1) {
Expand Down Expand Up @@ -291,8 +282,8 @@ public String toString() {
}
};

private static boolean isTypeChar(char c) {
return Character.isLetterOrDigit(c) || c == '_' || c == '.' || c == ' ' || c == '$';
/**
 * Decides whether a code point may appear inside a type token. Any code
 * point is allowed except the structural delimiters collected in
 * {@code NON_TYPE_CHARACTERS}, so field names may contain spaces,
 * punctuation, and supplementary (surrogate-pair) characters.
 */
private static boolean isTypeChar(int codePoint) {
  final boolean isDelimiter = NON_TYPE_CHARACTERS.contains(codePoint);
  return !isDelimiter;
}

/**
Expand All @@ -307,35 +298,37 @@ private static boolean isTypeChar(char c) {
* in any type in Hive, it is safe to do so.
*/
private static ArrayList<Token> tokenize(String typeInfoString) {
ArrayList<Token> tokens = new ArrayList<Token>(0);
ArrayList<Token> tokens = new ArrayList<>(0);

int[] codePoints = typeInfoString.codePoints().toArray();
int begin = 0;
int end = 1;
while (end <= typeInfoString.length()) {
while (end <= codePoints.length) {
// last character ends a token?
// if there are quotes, all the text between the quotes
// is considered a single token (this can happen for
// timestamp with local time-zone)
if (begin > 0 &&
typeInfoString.charAt(begin - 1) == '(' &&
typeInfoString.charAt(begin) == '\'') {
codePoints[begin - 1] == '(' &&
codePoints[begin] == '\'') {
// Ignore starting quote
begin++;
do {
end++;
} while (typeInfoString.charAt(end) != '\'');
} else if (typeInfoString.charAt(begin) == '\'' &&
typeInfoString.charAt(begin + 1) == ')') {
} while (codePoints[end] != '\'');
} else if (codePoints[begin] == '\'' &&
codePoints[begin + 1] == ')') {
// Ignore closing quote
begin++;
end++;
}
if (end == typeInfoString.length()
|| !isTypeChar(typeInfoString.charAt(end - 1))
|| !isTypeChar(typeInfoString.charAt(end))) {
if (end == codePoints.length
|| !isTypeChar(codePoints[end - 1])
|| !isTypeChar(codePoints[end])) {
Token t = new Token();
t.position = begin;
t.text = typeInfoString.substring(begin, end).trim();
t.isType = isTypeChar(typeInfoString.charAt(begin));
t.text = new String(codePoints, begin, end - begin).trim();
t.isType = isTypeChar(codePoints[begin]);
tokens.add(t);
begin = end;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@



import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import static org.junit.Assert.assertEquals;

import java.util.List;
import org.apache.hadoop.util.Lists;
import org.junit.Test;

/**
Expand Down Expand Up @@ -65,7 +66,15 @@ public void testTypeInfoParser() {
"char(123,)",
"char()",
"char(",
"decimal()"
"decimal()",
"struct<user<id:int>",
"struct<user>id:int>",
"struct<user(id:int>",
"struct<user)id:int>",
"struct<user'id:int>",
"struct<user,id:int>",
"struct<user:id:int>",
"struct<user;id:int>",
};

for (String typeString : validTypeStrings) {
Expand Down Expand Up @@ -125,4 +134,54 @@ public void testDecimal() {
assertEquals("Failed for " + testCase.typeString, testCase.expectedScale, decimalType.getScale());
}
}

@Test
public void testStruct() {
  // Struct field names may contain spaces and arbitrary special characters;
  // each type string must parse into the matching StructTypeInfo.
  assertEquals(
      new StructTypeInfo(
          Lists.newArrayList("user id", "user group"),
          Lists.newArrayList(TypeInfoFactory.intTypeInfo, TypeInfoFactory.intTypeInfo)),
      TypeInfoUtils.getTypeInfoFromTypeString("struct<user id:int,user group: int>"));
  assertEquals(
      new StructTypeInfo(
          Lists.newArrayList("user-_$+/^ |id", "user-_$+/^ |group"),
          Lists.newArrayList(TypeInfoFactory.intTypeInfo, TypeInfoFactory.intTypeInfo)),
      TypeInfoUtils.getTypeInfoFromTypeString("struct<user-_$+/^ |id:int,user-_$+/^ |group: int>"));
  // Supplementary character encoded as a surrogate pair must survive
  // tokenization intact (the tokenizer works on code points, not chars).
  assertEquals(
      new StructTypeInfo(
          Lists.newArrayList("\uD842\uDFB7野家♥️"),
          Lists.newArrayList(TypeInfoFactory.stringTypeInfo)),
      TypeInfoUtils.getTypeInfoFromTypeString("struct<\uD842\uDFB7野家♥️:string>"));
}

@Test
public void testMultipleValues() {
  // A single string holding several top-level types (separated by ',', ':'
  // or ';') must parse into the corresponding list of TypeInfos, even when
  // struct field names contain special characters and surrogate pairs.
  StructTypeInfo usersStruct = new StructTypeInfo(
      Lists.newArrayList("user-_$+/^ |id", "user-_$+/^ |group"),
      Lists.newArrayList(TypeInfoFactory.intTypeInfo, TypeInfoFactory.intTypeInfo));
  StructTypeInfo surrogateStruct = new StructTypeInfo(
      Lists.newArrayList("\uD842\uDFB7野家♥️"),
      Lists.newArrayList(TypeInfoFactory.stringTypeInfo));
  List<TypeInfo> actual = TypeInfoUtils.getTypeInfosFromTypeString(
      "int,struct<user-_$+/^ |id:int,user-_$+/^ |group: int>:struct<\uD842\uDFB7野家♥️:string>;double");
  assertEquals(
      Lists.newArrayList(
          TypeInfoFactory.intTypeInfo, usersStruct, surrogateStruct, TypeInfoFactory.doubleTypeInfo),
      actual);
}
}
Loading