Skip to content

Commit f849384

Browse files
Constantin Murarujulienledem
authored andcommitted
PARQUET-968 Add Hive/Presto support in ProtoParquet
This PR adds Hive (https://github.com/apache/hive) and Presto (https://github.com/prestodb/presto) support for parquet messages written with ProtoParquetWriter. Hive and other tools, such as Presto (used by AWS Athena), rely on specific LIST/MAP wrappers (as defined in the parquet spec: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md). These wrappers are currently missing from the ProtoParquet schema. AvroParquet works just fine, because it adds these wrappers when it deals with arrays and maps. This PR brings these wrappers in parquet-proto, providing the same functionality that already exists in parquet-avro. This is backward compatible. Messages written without the extra LIST/MAP wrappers are still being read successfully using the updated ProtoParquetReader. Regarding the change. Given the following protobuf schema: ``` message ListOfPrimitives { repeated int64 my_repeated_id = 1; } ``` Old parquet schema was: ``` message ListOfPrimitives { repeated int64 my_repeated_id = 1; } ``` New parquet schema is: ``` message ListOfPrimitives { required group my_repeated_id (LIST) = 1 { repeated group list { required int64 element; } } } ``` --- For list of messages, the changes look like this: Protobuf schema: ``` message ListOfMessages { string top_field = 1; repeated MyInnerMessage first_array = 2; } message MyInnerMessage { int32 inner_field = 1; } ``` Old parquet schema was: ``` message TestProto3.ListOfMessages { optional binary top_field (UTF8) = 1; repeated group first_array = 2 { optional int32 inner_field = 1; } } ``` The expected parquet schema, compatible with Hive (and similar to parquet-avro) is the following (notice the LIST wrapper): ``` message TestProto3.ListOfMessages { optional binary top_field (UTF8) = 1; required group first_array (LIST) = 2 { repeated group list { optional group element { optional int32 inner_field = 1; } } } } ``` --- Similar for maps. Protobuf schema: ``` message TopMessage { map<int64, MyInnerMessage> myMap = 1; } message MyInnerMessage { int32 inner_field = 1; } ``` Old parquet schema: ``` message TestProto3.TopMessage { repeated group myMap = 1 { optional int64 key = 1; optional group value = 2 { optional int32 inner_field = 1; } } } ``` New parquet schema (notice the `MAP` wrapper): ``` message TestProto3.TopMessage { required group myMap (MAP) = 1 { repeated group key_value { required int64 key; optional group value { optional int32 inner_field = 1; } } } } ``` Jira: https://issues.apache.org/jira/browse/PARQUET-968 Author: Constantin Muraru <[email protected]> Author: Benoît Hanotte <[email protected]> Closes #411 from costimuraru/PARQUET-968 and squashes the following commits: 16eafcb [Benoît Hanotte] PARQUET-968 add proto flag to enable writing using specs-compliant schemas (#2) a8bd704 [Constantin Muraru] Pick up commit from @andredasilvapinto 5cf9248 [Constantin Muraru] PARQUET-968 Add Hive support in ProtoParquet
1 parent af977ad commit f849384

File tree

9 files changed

+1331
-84
lines changed

9 files changed

+1331
-84
lines changed

parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java

Lines changed: 125 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,14 @@
2424
import com.twitter.elephantbird.util.Protobufs;
2525
import org.apache.parquet.column.Dictionary;
2626
import org.apache.parquet.io.InvalidRecordException;
27+
import org.apache.parquet.io.ParquetDecodingException;
2728
import org.apache.parquet.io.api.Binary;
2829
import org.apache.parquet.io.api.Converter;
2930
import org.apache.parquet.io.api.GroupConverter;
3031
import org.apache.parquet.io.api.PrimitiveConverter;
3132
import org.apache.parquet.schema.GroupType;
3233
import org.apache.parquet.schema.IncompatibleSchemaModificationException;
34+
import org.apache.parquet.schema.OriginalType;
3335
import org.apache.parquet.schema.Type;
3436

3537
import java.util.HashMap;
@@ -126,10 +128,15 @@ public void add(Object value) {
126128
};
127129
}
128130

131+
if (OriginalType.LIST == parquetType.getOriginalType()) {
132+
return new ListConverter(parentBuilder, fieldDescriptor, parquetType);
133+
}
134+
if (OriginalType.MAP == parquetType.getOriginalType()) {
135+
return new MapConverter(parentBuilder, fieldDescriptor, parquetType);
136+
}
129137
return newScalarConverter(parent, parentBuilder, fieldDescriptor, parquetType);
130138
}
131139

132-
133140
private Converter newScalarConverter(ParentValueContainer pvc, Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) {
134141

135142
JavaType javaType = fieldDescriptor.getJavaType();
@@ -342,4 +349,121 @@ public void addBinary(Binary binary) {
342349
}
343350

344351
}
352+
353+
/**
354+
* This class unwraps the additional LIST wrapper and makes it possible to read the underlying data and then convert
355+
* it to protobuf.
356+
* <p>
357+
* Consider the following protobuf schema:
358+
* message SimpleList {
359+
* repeated int64 first_array = 1;
360+
* }
361+
* <p>
362+
* A LIST wrapper is created in parquet for the above mentioned protobuf schema:
363+
* message SimpleList {
364+
* optional group first_array (LIST) = 1 {
365+
* repeated group list {
366+
* optional int32 element;
367+
* }
368+
* }
369+
* }
370+
* <p>
371+
* The LIST wrappers are used by 3rd party tools, such as Hive, to read parquet arrays. The wrapper contains
372+
* a repeated group named 'list', itself containing only one field called 'element' of the type of the repeated
373+
* object (can be a primitive as in this example or a group in case of a repeated message in protobuf).
374+
*/
375+
final class ListConverter extends GroupConverter {
376+
private final Converter converter;
377+
378+
public ListConverter(Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) {
379+
OriginalType originalType = parquetType.getOriginalType();
380+
if (originalType != OriginalType.LIST || parquetType.isPrimitive()) {
381+
throw new ParquetDecodingException("Expected LIST wrapper. Found: " + originalType + " instead.");
382+
}
383+
384+
GroupType rootWrapperType = parquetType.asGroupType();
385+
if (!rootWrapperType.containsField("list") || rootWrapperType.getType("list").isPrimitive()) {
386+
throw new ParquetDecodingException("Expected repeated 'list' group inside LIST wrapperr but got: " + rootWrapperType);
387+
}
388+
389+
GroupType listType = rootWrapperType.getType("list").asGroupType();
390+
if (!listType.containsField("element")) {
391+
throw new ParquetDecodingException("Expected 'element' inside repeated list group but got: " + listType);
392+
}
393+
394+
Type elementType = listType.getType("element");
395+
converter = newMessageConverter(parentBuilder, fieldDescriptor, elementType);
396+
}
397+
398+
@Override
399+
public Converter getConverter(int fieldIndex) {
400+
if (fieldIndex > 0) {
401+
throw new ParquetDecodingException("Unexpected multiple fields in the LIST wrapper");
402+
}
403+
404+
return new GroupConverter() {
405+
@Override
406+
public Converter getConverter(int fieldIndex) {
407+
return converter;
408+
}
409+
410+
@Override
411+
public void start() {
412+
413+
}
414+
415+
@Override
416+
public void end() {
417+
418+
}
419+
};
420+
}
421+
422+
@Override
423+
public void start() {
424+
425+
}
426+
427+
@Override
428+
public void end() {
429+
430+
}
431+
}
432+
433+
434+
final class MapConverter extends GroupConverter {
435+
private final Converter converter;
436+
437+
public MapConverter(Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) {
438+
OriginalType originalType = parquetType.getOriginalType();
439+
if (originalType != OriginalType.MAP) {
440+
throw new ParquetDecodingException("Expected MAP wrapper. Found: " + originalType + " instead.");
441+
}
442+
443+
Type parquetSchema;
444+
if (parquetType.asGroupType().containsField("key_value")){
445+
parquetSchema = parquetType.asGroupType().getType("key_value");
446+
} else {
447+
throw new ParquetDecodingException("Expected map but got: " + parquetType);
448+
}
449+
450+
converter = newMessageConverter(parentBuilder, fieldDescriptor, parquetSchema);
451+
}
452+
453+
@Override
454+
public Converter getConverter(int fieldIndex) {
455+
if (fieldIndex > 0) {
456+
throw new ParquetDecodingException("Unexpected multiple fields in the MAP wrapper");
457+
}
458+
return converter;
459+
}
460+
461+
@Override
462+
public void start() {
463+
}
464+
465+
@Override
466+
public void end() {
467+
}
468+
}
345469
}
Lines changed: 133 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/*
1+
/*
22
* Licensed to the Apache Software Foundation (ASF) under one
33
* or more contributor license agreements. See the NOTICE file
44
* distributed with this work for additional information
@@ -18,36 +18,49 @@
1818
*/
1919
package org.apache.parquet.proto;
2020

21-
import static org.apache.parquet.schema.OriginalType.ENUM;
22-
import static org.apache.parquet.schema.OriginalType.UTF8;
23-
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
24-
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;
25-
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
26-
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
27-
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
28-
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
29-
30-
import java.util.List;
31-
21+
import com.google.protobuf.Descriptors;
22+
import com.google.protobuf.Descriptors.FieldDescriptor;
23+
import com.google.protobuf.Descriptors.FieldDescriptor.JavaType;
24+
import com.google.protobuf.Message;
25+
import com.twitter.elephantbird.util.Protobufs;
3226
import org.apache.parquet.schema.MessageType;
27+
import org.apache.parquet.schema.OriginalType;
28+
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
3329
import org.apache.parquet.schema.Type;
3430
import org.apache.parquet.schema.Types;
3531
import org.apache.parquet.schema.Types.Builder;
3632
import org.apache.parquet.schema.Types.GroupBuilder;
37-
38-
import com.google.protobuf.Descriptors;
39-
import com.google.protobuf.Descriptors.FieldDescriptor.JavaType;
40-
import com.google.protobuf.Message;
41-
import com.twitter.elephantbird.util.Protobufs;
4233
import org.slf4j.Logger;
4334
import org.slf4j.LoggerFactory;
4435

36+
import java.util.List;
37+
38+
import static org.apache.parquet.schema.OriginalType.ENUM;
39+
import static org.apache.parquet.schema.OriginalType.UTF8;
40+
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*;
41+
4542
/**
4643
* Converts a Protocol Buffer Descriptor into a Parquet schema.
4744
*/
4845
public class ProtoSchemaConverter {
4946

5047
private static final Logger LOG = LoggerFactory.getLogger(ProtoSchemaConverter.class);
48+
private final boolean parquetSpecsCompliant;
49+
50+
public ProtoSchemaConverter() {
51+
this(false);
52+
}
53+
54+
/**
55+
* Instanciate a schema converter to get the parquet schema corresponding to protobuf classes.
56+
* @param parquetSpecsCompliant If set to false, the parquet schema generated will be using the old
57+
* schema style (prior to PARQUET-968) to provide backward-compatibility
58+
* but which does not use LIST and MAP wrappers around collections as required
59+
* by the parquet specifications. If set to true, specs compliant schemas are used.
60+
*/
61+
public ProtoSchemaConverter(boolean parquetSpecsCompliant) {
62+
this.parquetSpecsCompliant = parquetSpecsCompliant;
63+
}
5164

5265
public MessageType convert(Class<? extends Message> protobufClass) {
5366
LOG.debug("Converting protocol buffer class \"" + protobufClass + "\" to parquet schema.");
@@ -60,8 +73,8 @@ public MessageType convert(Class<? extends Message> protobufClass) {
6073
}
6174

6275
/* Iterates over list of fields. **/
63-
private <T> GroupBuilder<T> convertFields(GroupBuilder<T> groupBuilder, List<Descriptors.FieldDescriptor> fieldDescriptors) {
64-
for (Descriptors.FieldDescriptor fieldDescriptor : fieldDescriptors) {
76+
private <T> GroupBuilder<T> convertFields(GroupBuilder<T> groupBuilder, List<FieldDescriptor> fieldDescriptors) {
77+
for (FieldDescriptor fieldDescriptor : fieldDescriptors) {
6578
groupBuilder =
6679
addField(fieldDescriptor, groupBuilder)
6780
.id(fieldDescriptor.getNumber())
@@ -70,7 +83,7 @@ private <T> GroupBuilder<T> convertFields(GroupBuilder<T> groupBuilder, List<Des
7083
return groupBuilder;
7184
}
7285

73-
private Type.Repetition getRepetition(Descriptors.FieldDescriptor descriptor) {
86+
private Type.Repetition getRepetition(FieldDescriptor descriptor) {
7487
if (descriptor.isRequired()) {
7588
return Type.Repetition.REQUIRED;
7689
} else if (descriptor.isRepeated()) {
@@ -80,26 +93,110 @@ private Type.Repetition getRepetition(Descriptors.FieldDescriptor descriptor) {
8093
}
8194
}
8295

83-
private <T> Builder<? extends Builder<?, GroupBuilder<T>>, GroupBuilder<T>> addField(Descriptors.FieldDescriptor descriptor, GroupBuilder<T> builder) {
84-
Type.Repetition repetition = getRepetition(descriptor);
85-
JavaType javaType = descriptor.getJavaType();
96+
private <T> Builder<? extends Builder<?, GroupBuilder<T>>, GroupBuilder<T>> addField(FieldDescriptor descriptor, final GroupBuilder<T> builder) {
97+
if (descriptor.getJavaType() == JavaType.MESSAGE) {
98+
return addMessageField(descriptor, builder);
99+
}
100+
101+
ParquetType parquetType = getParquetType(descriptor);
102+
if (descriptor.isRepeated() && parquetSpecsCompliant) {
103+
// the old schema style did not include the LIST wrapper around repeated fields
104+
return addRepeatedPrimitive(descriptor, parquetType.primitiveType, parquetType.originalType, builder);
105+
}
106+
107+
return builder.primitive(parquetType.primitiveType, getRepetition(descriptor)).as(parquetType.originalType);
108+
}
109+
110+
private <T> Builder<? extends Builder<?, GroupBuilder<T>>, GroupBuilder<T>> addRepeatedPrimitive(FieldDescriptor descriptor,
111+
PrimitiveTypeName primitiveType,
112+
OriginalType originalType,
113+
final GroupBuilder<T> builder) {
114+
return builder
115+
.group(Type.Repetition.OPTIONAL).as(OriginalType.LIST)
116+
.group(Type.Repetition.REPEATED)
117+
.primitive(primitiveType, Type.Repetition.REQUIRED).as(originalType)
118+
.named("element")
119+
.named("list");
120+
}
121+
122+
private <T> GroupBuilder<GroupBuilder<T>> addRepeatedMessage(FieldDescriptor descriptor, GroupBuilder<T> builder) {
123+
GroupBuilder<GroupBuilder<GroupBuilder<GroupBuilder<T>>>> result =
124+
builder
125+
.group(Type.Repetition.OPTIONAL).as(OriginalType.LIST)
126+
.group(Type.Repetition.REPEATED)
127+
.group(Type.Repetition.OPTIONAL);
128+
129+
convertFields(result, descriptor.getMessageType().getFields());
130+
131+
return result.named("element").named("list");
132+
}
133+
134+
private <T> GroupBuilder<GroupBuilder<T>> addMessageField(FieldDescriptor descriptor, final GroupBuilder<T> builder) {
135+
if (descriptor.isMapField() && parquetSpecsCompliant) {
136+
// the old schema style did not include the MAP wrapper around map groups
137+
return addMapField(descriptor, builder);
138+
}
139+
if (descriptor.isRepeated() && parquetSpecsCompliant) {
140+
// the old schema style did not include the LIST wrapper around repeated messages
141+
return addRepeatedMessage(descriptor, builder);
142+
}
143+
144+
// Plain message
145+
GroupBuilder<GroupBuilder<T>> group = builder.group(getRepetition(descriptor));
146+
convertFields(group, descriptor.getMessageType().getFields());
147+
return group;
148+
}
149+
150+
private <T> GroupBuilder<GroupBuilder<T>> addMapField(FieldDescriptor descriptor, final GroupBuilder<T> builder) {
151+
List<FieldDescriptor> fields = descriptor.getMessageType().getFields();
152+
if (fields.size() != 2) {
153+
throw new UnsupportedOperationException("Expected two fields for the map (key/value), but got: " + fields);
154+
}
155+
156+
ParquetType mapKeyParquetType = getParquetType(fields.get(0));
157+
158+
GroupBuilder<GroupBuilder<GroupBuilder<T>>> group = builder
159+
.group(Type.Repetition.OPTIONAL).as(OriginalType.MAP) // only optional maps are allowed in Proto3
160+
.group(Type.Repetition.REPEATED) // key_value wrapper
161+
.primitive(mapKeyParquetType.primitiveType, Type.Repetition.REQUIRED).as(mapKeyParquetType.originalType).named("key");
162+
163+
return addField(fields.get(1), group).named("value")
164+
.named("key_value");
165+
}
166+
167+
private ParquetType getParquetType(FieldDescriptor fieldDescriptor) {
168+
169+
JavaType javaType = fieldDescriptor.getJavaType();
86170
switch (javaType) {
87-
case BOOLEAN: return builder.primitive(BOOLEAN, repetition);
88-
case INT: return builder.primitive(INT32, repetition);
89-
case LONG: return builder.primitive(INT64, repetition);
90-
case FLOAT: return builder.primitive(FLOAT, repetition);
91-
case DOUBLE: return builder.primitive(DOUBLE, repetition);
92-
case BYTE_STRING: return builder.primitive(BINARY, repetition);
93-
case STRING: return builder.primitive(BINARY, repetition).as(UTF8);
94-
case MESSAGE: {
95-
GroupBuilder<GroupBuilder<T>> group = builder.group(repetition);
96-
convertFields(group, descriptor.getMessageType().getFields());
97-
return group;
98-
}
99-
case ENUM: return builder.primitive(BINARY, repetition).as(ENUM);
171+
case INT: return ParquetType.of(INT32);
172+
case LONG: return ParquetType.of(INT64);
173+
case DOUBLE: return ParquetType.of(DOUBLE);
174+
case BOOLEAN: return ParquetType.of(BOOLEAN);
175+
case FLOAT: return ParquetType.of(FLOAT);
176+
case STRING: return ParquetType.of(BINARY, UTF8);
177+
case ENUM: return ParquetType.of(BINARY, ENUM);
178+
case BYTE_STRING: return ParquetType.of(BINARY);
100179
default:
101180
throw new UnsupportedOperationException("Cannot convert Protocol Buffer: unknown type " + javaType);
102181
}
103182
}
104183

184+
private static class ParquetType {
185+
PrimitiveTypeName primitiveType;
186+
OriginalType originalType;
187+
188+
private ParquetType(PrimitiveTypeName primitiveType, OriginalType originalType) {
189+
this.primitiveType = primitiveType;
190+
this.originalType = originalType;
191+
}
192+
193+
public static ParquetType of(PrimitiveTypeName primitiveType, OriginalType originalType) {
194+
return new ParquetType(primitiveType, originalType);
195+
}
196+
197+
public static ParquetType of(PrimitiveTypeName primitiveType) {
198+
return of(primitiveType, null);
199+
}
200+
}
201+
105202
}

0 commit comments

Comments
 (0)