-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PARQUET-968 Add Hive/Presto support in ProtoParquet #411
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,12 +24,14 @@ | |
import com.twitter.elephantbird.util.Protobufs; | ||
import org.apache.parquet.column.Dictionary; | ||
import org.apache.parquet.io.InvalidRecordException; | ||
import org.apache.parquet.io.ParquetDecodingException; | ||
import org.apache.parquet.io.api.Binary; | ||
import org.apache.parquet.io.api.Converter; | ||
import org.apache.parquet.io.api.GroupConverter; | ||
import org.apache.parquet.io.api.PrimitiveConverter; | ||
import org.apache.parquet.schema.GroupType; | ||
import org.apache.parquet.schema.IncompatibleSchemaModificationException; | ||
import org.apache.parquet.schema.OriginalType; | ||
import org.apache.parquet.schema.Type; | ||
|
||
import java.util.HashMap; | ||
|
@@ -129,10 +131,14 @@ public void add(Object value) { | |
}; | ||
} | ||
|
||
return newScalarConverter(parent, parentBuilder, fieldDescriptor, parquetType); | ||
OriginalType originalType = parquetType.getOriginalType() == null ? OriginalType.UTF8 : parquetType.getOriginalType(); | ||
switch (originalType) { | ||
case LIST: return new ListConverter(parentBuilder, fieldDescriptor, parquetType); | ||
case MAP: return new MapConverter(parentBuilder, fieldDescriptor, parquetType); | ||
default: return newScalarConverter(parent, parentBuilder, fieldDescriptor, parquetType); | ||
} | ||
} | ||
|
||
|
||
private Converter newScalarConverter(ParentValueContainer pvc, Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) { | ||
|
||
JavaType javaType = fieldDescriptor.getJavaType(); | ||
|
@@ -345,4 +351,121 @@ public void addBinary(Binary binary) { | |
} | ||
|
||
} | ||
|
||
/** | ||
* This class unwraps the additional LIST wrapper and makes it possible to read the underlying data and then convert | ||
* it to protobuf. | ||
* <p> | ||
* Consider the following protobuf schema: | ||
* message SimpleList { | ||
* repeated int64 first_array = 1; | ||
* } | ||
* <p> | ||
* A LIST wrapper is created in parquet for the above mentioned protobuf schema: | ||
* message SimpleList { | ||
* required group first_array (LIST) = 1 { | ||
* repeated int64 element; | ||
* } | ||
* } | ||
* <p> | ||
* The LIST wrappers are used by 3rd party tools, such as Hive, to read parquet arrays. The wrapper contains | ||
* only one field: either a primitive field (like in the example above, where we have an array of ints) or | ||
* another group (array of messages). | ||
*/ | ||
// Group converter for a field annotated as LIST: it resolves the nested "list" type (and "element", when present) and hands all reads to one inner converter. | ||
final class ListConverter extends GroupConverter { | ||
private final Converter converter; | ||
private final boolean listOfMessage; | ||
|
||
// Throws ParquetDecodingException when parquetType is not annotated as LIST or has no "list" field. | ||
public ListConverter(Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) { | ||
OriginalType originalType = parquetType.getOriginalType(); | ||
if (originalType != OriginalType.LIST) { | ||
throw new ParquetDecodingException("Expected LIST wrapper. Found: " + originalType + " instead."); | ||
} | ||
|
||
// NOTE(review): this flag is assigned here but never read in the visible part of this class. | ||
listOfMessage = fieldDescriptor.getJavaType() == JavaType.MESSAGE; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This variable is unused. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done in costimuraru#2 |
||
|
||
Type parquetSchema; | ||
if (parquetType.asGroupType().containsField("list")) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NIPS: We could extract local variable with explaining name. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done in costimuraru#2 |
||
parquetSchema = parquetType.asGroupType().getType("list"); | ||
// Descend one more level when the "list" group has an "element" field; otherwise the "list" group type itself is used. | ||
if (parquetSchema.asGroupType().containsField("element")) { | ||
parquetSchema = parquetSchema.asGroupType().getType("element"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Javadoc for class ListConverter is not correct There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done in costimuraru#2 |
||
} | ||
} else { | ||
throw new ParquetDecodingException("Expected list but got: " + parquetType); | ||
} | ||
|
||
// The inner converter is built once, for whichever type was resolved above. | ||
converter = newMessageConverter(parentBuilder, fieldDescriptor, parquetSchema); | ||
} | ||
|
||
// Accepts only field index 0. Each call returns a new anonymous wrapper group converter. | ||
@Override | ||
public Converter getConverter(int fieldIndex) { | ||
if (fieldIndex > 0) { | ||
throw new ParquetDecodingException("Unexpected multiple fields in the LIST wrapper"); | ||
} | ||
|
||
return new GroupConverter() { | ||
// The wrapper ignores its field index and always returns the shared inner converter. | ||
@Override | ||
public Converter getConverter(int fieldIndex) { | ||
return converter; | ||
} | ||
|
||
// No work is done at the start or end of the wrapper group. | ||
@Override | ||
public void start() { | ||
|
||
} | ||
|
||
@Override | ||
public void end() { | ||
|
||
} | ||
}; | ||
} | ||
|
||
// No work is done at the start or end of the LIST group itself. | ||
@Override | ||
public void start() { | ||
|
||
} | ||
|
||
@Override | ||
public void end() { | ||
|
||
} | ||
} | ||
|
||
|
||
// Group converter for a field annotated as MAP: it resolves the nested "key_value" type and hands all reads to one inner converter. | ||
final class MapConverter extends GroupConverter { | ||
private final Converter converter; | ||
|
||
// Throws ParquetDecodingException when parquetType is not annotated as MAP or has no "key_value" field. | ||
public MapConverter(Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) { | ||
OriginalType originalType = parquetType.getOriginalType(); | ||
if (originalType != OriginalType.MAP) { | ||
throw new ParquetDecodingException("Expected MAP wrapper. Found: " + originalType + " instead."); | ||
} | ||
|
||
Type parquetSchema; | ||
if (parquetType.asGroupType().containsField("key_value")){ | ||
parquetSchema = parquetType.asGroupType().getType("key_value"); | ||
} else { | ||
throw new ParquetDecodingException("Expected map but got: " + parquetType); | ||
} | ||
|
||
// The inner converter is built once, for the "key_value" type. | ||
converter = newMessageConverter(parentBuilder, fieldDescriptor, parquetSchema); | ||
} | ||
|
||
// Accepts only field index 0 and returns the inner converter directly, with no intermediate wrapper. | ||
@Override | ||
public Converter getConverter(int fieldIndex) { | ||
if (fieldIndex > 0) { | ||
throw new ParquetDecodingException("Unexpected multiple fields in the MAP wrapper"); | ||
} | ||
return converter; | ||
} | ||
|
||
// No work is done at the start or end of the MAP group. | ||
@Override | ||
public void start() { | ||
} | ||
|
||
@Override | ||
public void end() { | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is there a condition? When will the original type be null?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess that data generated by previous versions of parquet-protobuf does not have the "OriginalType" annotation for repeated fields, hence this conditional test to stay backward compatible.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe the reason for this is that if originalType is null, the switch will throw an exception.
In costimuraru#2 I replaced the switch with an if.