From af977adc43a071a09652fea4ce3deba2d5b8d171 Mon Sep 17 00:00:00 2001 From: Masayuki Takahashi Date: Sat, 21 Apr 2018 14:58:35 +0100 Subject: [PATCH] PARQUET-1128: [Java] Upgrade the Apache Arrow version to 0.8.0 for SchemaConverter When I converted a Parquet (1.9.1-SNAPSHOT) schema to Arrow (0.4.0) with SchemaConverter, this exception was raised. ``` java.lang.NoClassDefFoundError: org/apache/arrow/vector/types/pojo/ArrowType$Struct_ at net.wrap_trap.parquet_arrow.ParquetToArrowConverter.convertToArrow(ParquetToArrowConverter.java:67) at net.wrap_trap.parquet_arrow.ParquetToArrowConverter.convertToArrow(ParquetToArrowConverter.java:40) at net.wrap_trap.parquet_arrow.ParquetToArrowConverterTest.parquetToArrowConverterTest(ParquetToArrowConverterTest.java:27) ``` The reason is that SchemaConverter refers to Apache Arrow 0.1.0. I upgraded the Apache Arrow version to 0.8.0 (latest) for SchemaConverter. Author: Masayuki Takahashi Closes #443 from masayuki038/PARQUET-1128 and squashes the following commits: 8ba47813 [Masayuki Takahashi] PARQUET-1128: [Java] Upgrade the Apache Arrow version to 0.8.0 for SchemaConverter b80d793a [Masayuki Takahashi] PARQUET-1128: [Java] Upgrade the Apache Arrow version to 0.8.0 for SchemaConverter --- parquet-arrow/pom.xml | 2 +- .../parquet/arrow/schema/SchemaConverter.java | 227 ++++++------ .../arrow/schema/TestSchemaConverter.java | 344 +++++++++--------- 3 files changed, 299 insertions(+), 274 deletions(-) diff --git a/parquet-arrow/pom.xml b/parquet-arrow/pom.xml index de31e16b49..232167ecb3 100644 --- a/parquet-arrow/pom.xml +++ b/parquet-arrow/pom.xml @@ -33,7 +33,7 @@ https://parquet.apache.org - 0.1.0 + 0.8.0 diff --git a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java index cf4ec0d7d6..1d69c45231 100644 --- a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java +++
b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java @@ -47,8 +47,8 @@ import java.util.ArrayList; import java.util.List; -import org.apache.arrow.flatbuf.Precision; -import org.apache.arrow.flatbuf.TimeUnit; +import org.apache.arrow.vector.types.DateUnit; +import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor; import org.apache.arrow.vector.types.pojo.ArrowType.Binary; @@ -59,7 +59,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.ArrowType.Interval; import org.apache.arrow.vector.types.pojo.ArrowType.Null; -import org.apache.arrow.vector.types.pojo.ArrowType.Struct_; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct; import org.apache.arrow.vector.types.pojo.ArrowType.Time; import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; import org.apache.arrow.vector.types.pojo.ArrowType.Union; @@ -141,13 +141,22 @@ public TypeMapping visit(Null type) { } @Override - public TypeMapping visit(Struct_ type) { + public TypeMapping visit(Struct type) { List parquetTypes = fromArrow(children); return new StructTypeMapping(field, addToBuilder(parquetTypes, Types.buildGroup(OPTIONAL)).named(fieldName), parquetTypes); } @Override public TypeMapping visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + return createListTypeMapping(); + } + + @Override + public TypeMapping visit(org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList type) { + return createListTypeMapping(); + } + + private ListTypeMapping createListTypeMapping() { if (children.size() != 1) { throw new IllegalArgumentException("list fields must have exactly one child: " + field); } @@ -167,31 +176,31 @@ public TypeMapping visit(Union type) { public TypeMapping visit(Int type) { boolean signed = type.getIsSigned(); switch (type.getBitWidth()) { - case 8: - return 
primitive(INT32, signed ? INT_8 : UINT_8); - case 16: - return primitive(INT32, signed ? INT_16 : UINT_16); - case 32: - return primitive(INT32, signed ? INT_32 : UINT_32); - case 64: - return primitive(INT64, signed ? INT_64 : UINT_64); - default: - throw new IllegalArgumentException("Illegal int type: " + field); + case 8: + return primitive(INT32, signed ? INT_8 : UINT_8); + case 16: + return primitive(INT32, signed ? INT_16 : UINT_16); + case 32: + return primitive(INT32, signed ? INT_32 : UINT_32); + case 64: + return primitive(INT64, signed ? INT_64 : UINT_64); + default: + throw new IllegalArgumentException("Illegal int type: " + field); } } @Override public TypeMapping visit(FloatingPoint type) { switch (type.getPrecision()) { - case Precision.HALF: - // TODO(PARQUET-757): original type HalfFloat - return primitive(FLOAT); - case Precision.SINGLE: - return primitive(FLOAT); - case Precision.DOUBLE: - return primitive(DOUBLE); - default: - throw new IllegalArgumentException("Illegal float type: " + field); + case HALF: + // TODO(PARQUET-757): original type HalfFloat + return primitive(FLOAT); + case SINGLE: + return primitive(FLOAT); + case DOUBLE: + return primitive(DOUBLE); + default: + throw new IllegalArgumentException("Illegal float type: " + field); } } @@ -336,7 +345,7 @@ private TypeMapping fromParquetGroup(GroupType type, String name) { OriginalType ot = type.getOriginalType(); if (ot == null) { List typeMappings = fromParquet(type.getFields()); - Field arrowField = new Field(name, type.isRepetition(OPTIONAL), new Struct_(), fields(typeMappings)); + Field arrowField = new Field(name, type.isRepetition(OPTIONAL), new Struct(), fields(typeMappings)); return new StructTypeMapping(arrowField, type, typeMappings); } else { switch (ot) { @@ -366,12 +375,12 @@ private TypeMapping field(ArrowType arrowType) { @Override public TypeMapping convertFLOAT(PrimitiveTypeName primitiveTypeName) throws RuntimeException { - return field(new 
ArrowType.FloatingPoint(Precision.SINGLE)); + return field(new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)); } @Override public TypeMapping convertDOUBLE(PrimitiveTypeName primitiveTypeName) throws RuntimeException { - return field(new ArrowType.FloatingPoint(Precision.DOUBLE)); + return field(new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)); } @Override @@ -381,41 +390,41 @@ public TypeMapping convertINT32(PrimitiveTypeName primitiveTypeName) throws Runt return integer(32, true); } switch (ot) { - case INT_8: - return integer(8, true); - case INT_16: - return integer(16, true); - case INT_32: - return integer(32, true); - case UINT_8: - return integer(8, false); - case UINT_16: - return integer(16, false); - case UINT_32: - return integer(32, false); - case DECIMAL: - return decimal(type.getDecimalMetadata()); - case DATE: - return field(new ArrowType.Date()); - case TIMESTAMP_MICROS: - return field(new ArrowType.Timestamp(TimeUnit.MICROSECOND)); - case TIMESTAMP_MILLIS: - return field(new ArrowType.Timestamp(TimeUnit.MILLISECOND)); - case TIME_MILLIS: - return field(new ArrowType.Time()); - default: - case TIME_MICROS: - case INT_64: - case UINT_64: - case UTF8: - case ENUM: - case BSON: - case INTERVAL: - case JSON: - case LIST: - case MAP: - case MAP_KEY_VALUE: - throw new IllegalArgumentException("illegal type " + type); + case INT_8: + return integer(8, true); + case INT_16: + return integer(16, true); + case INT_32: + return integer(32, true); + case UINT_8: + return integer(8, false); + case UINT_16: + return integer(16, false); + case UINT_32: + return integer(32, false); + case DECIMAL: + return decimal(type.getDecimalMetadata()); + case DATE: + return field(new ArrowType.Date(DateUnit.DAY)); + case TIMESTAMP_MICROS: + return field(new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MICROSECOND, "UTC")); + case TIMESTAMP_MILLIS: + return field(new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, 
"UTC")); + case TIME_MILLIS: + return field(new ArrowType.Time(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, 32)); + default: + case TIME_MICROS: + case INT_64: + case UINT_64: + case UTF8: + case ENUM: + case BSON: + case INTERVAL: + case JSON: + case LIST: + case MAP: + case MAP_KEY_VALUE: + throw new IllegalArgumentException("illegal type " + type); } } @@ -426,43 +435,42 @@ public TypeMapping convertINT64(PrimitiveTypeName primitiveTypeName) throws Runt return integer(64, true); } switch (ot) { - case INT_8: - return integer(8, true); - case INT_16: - return integer(16, true); - case INT_32: - return integer(32, true); - case INT_64: - return integer(64, true); - case UINT_8: - return integer(8, false); - case UINT_16: - return integer(16, false); - case UINT_32: - return integer(32, false); - case UINT_64: - return integer(64, false); - case DECIMAL: - return decimal(type.getDecimalMetadata()); - case DATE: - return field(new ArrowType.Date()); - case TIMESTAMP_MICROS: - return field(new ArrowType.Timestamp(TimeUnit.MICROSECOND)); - case TIMESTAMP_MILLIS: - return field(new ArrowType.Timestamp(TimeUnit.MILLISECOND)); - case TIME_MILLIS: - return field(new ArrowType.Time()); - default: - case TIME_MICROS: - case UTF8: - case ENUM: - case BSON: - case INTERVAL: - case JSON: - case LIST: - case MAP: - case MAP_KEY_VALUE: - throw new IllegalArgumentException("illegal type " + type); + case INT_8: + return integer(8, true); + case INT_16: + return integer(16, true); + case INT_32: + return integer(32, true); + case INT_64: + return integer(64, true); + case UINT_8: + return integer(8, false); + case UINT_16: + return integer(16, false); + case UINT_32: + return integer(32, false); + case UINT_64: + return integer(64, false); + case DECIMAL: + return decimal(type.getDecimalMetadata()); + case DATE: + return field(new ArrowType.Date(DateUnit.DAY)); + case TIMESTAMP_MICROS: + return field(new 
ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MICROSECOND, "UTC")); + case TIMESTAMP_MILLIS: + return field(new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC")); + default: + case TIME_MICROS: + case UTF8: + case ENUM: + case BSON: + case INTERVAL: + case JSON: + case LIST: + case MAP: + case MAP_KEY_VALUE: + case TIME_MILLIS: + throw new IllegalArgumentException("illegal type " + type); } } @@ -489,12 +497,12 @@ public TypeMapping convertBINARY(PrimitiveTypeName primitiveTypeName) throws Run return field(new ArrowType.Binary()); } switch (ot) { - case UTF8: - return field(new ArrowType.Utf8()); - case DECIMAL: - return decimal(type.getDecimalMetadata()); - default: - throw new IllegalArgumentException("illegal type " + type); + case UTF8: + return field(new ArrowType.Utf8()); + case DECIMAL: + return decimal(type.getDecimalMetadata()); + default: + throw new IllegalArgumentException("illegal type " + type); } } @@ -545,7 +553,7 @@ public TypeMapping visit(Null type) { } @Override - public TypeMapping visit(Struct_ type) { + public TypeMapping visit(Struct type) { if (parquetField.isPrimitive()) { throw new IllegalArgumentException("Parquet type not a group: " + parquetField); } @@ -555,6 +563,15 @@ public TypeMapping visit(Struct_ type) { @Override public TypeMapping visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + return createListTypeMapping(type); + } + + @Override + public TypeMapping visit(org.apache.arrow.vector.types.pojo.ArrowType.FixedSizeList type) { + return createListTypeMapping(type); + } + + private TypeMapping createListTypeMapping(ArrowType.ComplexType type) { if (arrowField.getChildren().size() != 1) { throw new IllegalArgumentException("Invalid list type: " + type); } diff --git a/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java b/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java index ec2b8074ce..654f773f9f 100644 
--- a/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java +++ b/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java @@ -43,11 +43,11 @@ import java.io.IOException; import java.util.List; +import org.apache.arrow.vector.types.IntervalUnit; -import org.apache.arrow.flatbuf.IntervalUnit; -import org.apache.arrow.flatbuf.Precision; -import org.apache.arrow.flatbuf.TimeUnit; -import org.apache.arrow.flatbuf.UnionMode; +import org.apache.arrow.vector.types.UnionMode; +import org.apache.arrow.vector.types.DateUnit; +import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; @@ -79,159 +79,167 @@ private static Field field(String name, ArrowType type, Field... children) { } private final Schema complexArrowSchema = new Schema(asList( - field("a", false, new ArrowType.Int(8, true)), - field("b", new ArrowType.Struct_(), - field("c", new ArrowType.Int(16, true)), - field("d", new ArrowType.Utf8())), - field("e", new ArrowType.List(), field(null, new ArrowType.Date())), - field("f", new ArrowType.FloatingPoint(Precision.SINGLE)), - field("g", new ArrowType.Timestamp(TimeUnit.MILLISECOND)), - field("h", new ArrowType.Interval(IntervalUnit.DAY_TIME)) - )); + field("a", false, new ArrowType.Int(8, true)), + field("b", new ArrowType.Struct(), + field("c", new ArrowType.Int(16, true)), + field("d", new ArrowType.Utf8())), + field("e", new ArrowType.List(), field(null, new ArrowType.Date(DateUnit.DAY))), + field("f", new ArrowType.FixedSizeList(1), field(null, new ArrowType.Date(DateUnit.DAY))), + field("g", new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), + field("h", new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC")), + field("i", new ArrowType.Interval(IntervalUnit.DAY_TIME)) + )); private final MessageType 
complexParquetSchema = Types.buildMessage() - .addField(Types.optional(INT32).as(INT_8).named("a")) - .addField(Types.optionalGroup() - .addField(Types.optional(INT32).as(INT_16).named("c")) - .addField(Types.optional(BINARY).as(UTF8).named("d")) - .named("b")) - .addField(Types.optionalList(). - setElementType(Types.optional(INT32).as(DATE).named("element")) - .named("e")) - .addField(Types.optional(FLOAT).named("f")) - .addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("g")) - .addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("h")) - .named("root"); + .addField(Types.optional(INT32).as(INT_8).named("a")) + .addField(Types.optionalGroup() + .addField(Types.optional(INT32).as(INT_16).named("c")) + .addField(Types.optional(BINARY).as(UTF8).named("d")) + .named("b")) + .addField(Types.optionalList(). + setElementType(Types.optional(INT32).as(DATE).named("element")) + .named("e")) + .addField(Types.optionalList(). + setElementType(Types.optional(INT32).as(DATE).named("element")) + .named("f")) + .addField(Types.optional(FLOAT).named("g")) + .addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("h")) + .addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("i")) + .named("root"); private final Schema allTypesArrowSchema = new Schema(asList( - field("a", false, new ArrowType.Null()), - field("b", new ArrowType.Struct_(), field("ba", new ArrowType.Null())), - field("c", new ArrowType.List(), field("ca", new ArrowType.Null())), - field("d", new ArrowType.Union(UnionMode.Sparse, new int[] {1, 2, 3}), field("da", new ArrowType.Null())), - field("e", new ArrowType.Int(8, true)), - field("e1", new ArrowType.Int(16, true)), - field("e2", new ArrowType.Int(32, true)), - field("e3", new ArrowType.Int(64, true)), - field("e4", new ArrowType.Int(8, false)), - field("e5", new ArrowType.Int(16, false)), - field("e6", new ArrowType.Int(32, false)), - field("e7", new ArrowType.Int(64, false)), - field("f", new 
ArrowType.FloatingPoint(Precision.SINGLE)), - field("f1", new ArrowType.FloatingPoint(Precision.DOUBLE)), - field("g", new ArrowType.Utf8()), - field("h", new ArrowType.Binary()), - field("i", new ArrowType.Bool()), - field("j", new ArrowType.Decimal(5, 5)), - field("j1", new ArrowType.Decimal(15, 5)), - field("j2", new ArrowType.Decimal(25, 5)), - field("k", new ArrowType.Date()), - field("l", new ArrowType.Time()), - field("m", new ArrowType.Timestamp(TimeUnit.MILLISECOND)), - field("n", new ArrowType.Interval(IntervalUnit.DAY_TIME)), - field("n1", new ArrowType.Interval(IntervalUnit.YEAR_MONTH)) - )); + field("a", false, new ArrowType.Null()), + field("b", new ArrowType.Struct(), field("ba", new ArrowType.Null())), + field("c", new ArrowType.List(), field("ca", new ArrowType.Null())), + field("d", new ArrowType.FixedSizeList(1), field("da", new ArrowType.Null())), + field("e", new ArrowType.Union(UnionMode.Sparse, new int[] {1, 2, 3}), field("ea", new ArrowType.Null())), + field("f", new ArrowType.Int(8, true)), + field("f1", new ArrowType.Int(16, true)), + field("f2", new ArrowType.Int(32, true)), + field("f3", new ArrowType.Int(64, true)), + field("f4", new ArrowType.Int(8, false)), + field("f5", new ArrowType.Int(16, false)), + field("f6", new ArrowType.Int(32, false)), + field("f7", new ArrowType.Int(64, false)), + field("g", new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), + field("g1", new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), + field("h", new ArrowType.Utf8()), + field("i", new ArrowType.Binary()), + field("j", new ArrowType.Bool()), + field("k", new ArrowType.Decimal(5, 5)), + field("k1", new ArrowType.Decimal(15, 5)), + field("k2", new ArrowType.Decimal(25, 5)), + field("l", new ArrowType.Date(DateUnit.DAY)), + field("m", new ArrowType.Time(org.apache.arrow.vector.types.TimeUnit.SECOND, 32)), + field("n", new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC")), + field("o", new 
ArrowType.Interval(IntervalUnit.DAY_TIME)), + field("o1", new ArrowType.Interval(IntervalUnit.YEAR_MONTH)) + )); private final MessageType allTypesParquetSchema = Types.buildMessage() - .addField(Types.optional(BINARY).named("a")) - .addField(Types.optionalGroup() - .addField(Types.optional(BINARY).named("ba")) - .named("b")) - .addField(Types.optionalList(). - setElementType(Types.optional(BINARY).named("element")) - .named("c")) - .addField(Types.optionalGroup() - .addField(Types.optional(BINARY).named("da")) - .named("d")) - .addField(Types.optional(INT32).as(INT_8).named("e")) - .addField(Types.optional(INT32).as(INT_16).named("e1")) - .addField(Types.optional(INT32).as(INT_32).named("e2")) - .addField(Types.optional(INT64).as(INT_64).named("e3")) - .addField(Types.optional(INT32).as(UINT_8).named("e4")) - .addField(Types.optional(INT32).as(UINT_16).named("e5")) - .addField(Types.optional(INT32).as(UINT_32).named("e6")) - .addField(Types.optional(INT64).as(UINT_64).named("e7")) - .addField(Types.optional(FLOAT).named("f")) - .addField(Types.optional(DOUBLE).named("f1")) - .addField(Types.optional(BINARY).as(UTF8).named("g")) - .addField(Types.optional(BINARY).named("h")) - .addField(Types.optional(BOOLEAN).named("i")) - .addField(Types.optional(INT32).as(DECIMAL).precision(5).scale(5).named("j")) - .addField(Types.optional(INT64).as(DECIMAL).precision(15).scale(5).named("j1")) - .addField(Types.optional(BINARY).as(DECIMAL).precision(25).scale(5).named("j2")) - .addField(Types.optional(INT32).as(DATE).named("k")) - .addField(Types.optional(INT32).as(TIME_MILLIS).named("l")) - .addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("m")) - .addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("n")) - .addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("n1")) - .named("root"); + .addField(Types.optional(BINARY).named("a")) + .addField(Types.optionalGroup() + .addField(Types.optional(BINARY).named("ba")) + 
.named("b")) + .addField(Types.optionalList(). + setElementType(Types.optional(BINARY).named("element")) + .named("c")) + .addField(Types.optionalList(). + setElementType(Types.optional(BINARY).named("element")) + .named("d")) + .addField(Types.optionalGroup() + .addField(Types.optional(BINARY).named("ea")) + .named("e")) + .addField(Types.optional(INT32).as(INT_8).named("f")) + .addField(Types.optional(INT32).as(INT_16).named("f1")) + .addField(Types.optional(INT32).as(INT_32).named("f2")) + .addField(Types.optional(INT64).as(INT_64).named("f3")) + .addField(Types.optional(INT32).as(UINT_8).named("f4")) + .addField(Types.optional(INT32).as(UINT_16).named("f5")) + .addField(Types.optional(INT32).as(UINT_32).named("f6")) + .addField(Types.optional(INT64).as(UINT_64).named("f7")) + .addField(Types.optional(FLOAT).named("g")) + .addField(Types.optional(DOUBLE).named("g1")) + .addField(Types.optional(BINARY).as(UTF8).named("h")) + .addField(Types.optional(BINARY).named("i")) + .addField(Types.optional(BOOLEAN).named("j")) + .addField(Types.optional(INT32).as(DECIMAL).precision(5).scale(5).named("k")) + .addField(Types.optional(INT64).as(DECIMAL).precision(15).scale(5).named("k1")) + .addField(Types.optional(BINARY).as(DECIMAL).precision(25).scale(5).named("k2")) + .addField(Types.optional(INT32).as(DATE).named("l")) + .addField(Types.optional(INT32).as(TIME_MILLIS).named("m")) + .addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("n")) + .addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("o")) + .addField(Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("o1")) + .named("root"); private final Schema supportedTypesArrowSchema = new Schema(asList( - field("b", new ArrowType.Struct_(), field("ba", new ArrowType.Binary())), - field("c", new ArrowType.List(), field(null, new ArrowType.Binary())), - field("e", new ArrowType.Int(8, true)), - field("e1", new ArrowType.Int(16, true)), - field("e2", new ArrowType.Int(32, true)), 
- field("e3", new ArrowType.Int(64, true)), - field("e4", new ArrowType.Int(8, false)), - field("e5", new ArrowType.Int(16, false)), - field("e6", new ArrowType.Int(32, false)), - field("e7", new ArrowType.Int(64, false)), - field("f", new ArrowType.FloatingPoint(Precision.SINGLE)), - field("f1", new ArrowType.FloatingPoint(Precision.DOUBLE)), - field("g", new ArrowType.Utf8()), - field("h", new ArrowType.Binary()), - field("i", new ArrowType.Bool()), - field("j", new ArrowType.Decimal(5, 5)), - field("j1", new ArrowType.Decimal(15, 5)), - field("j2", new ArrowType.Decimal(25, 5)), - field("k", new ArrowType.Date()), - field("l", new ArrowType.Time()), - field("m", new ArrowType.Timestamp(TimeUnit.MILLISECOND)) - )); + field("b", new ArrowType.Struct(), field("ba", new ArrowType.Binary())), + field("c", new ArrowType.List(), field(null, new ArrowType.Binary())), + field("e", new ArrowType.Int(8, true)), + field("e1", new ArrowType.Int(16, true)), + field("e2", new ArrowType.Int(32, true)), + field("e3", new ArrowType.Int(64, true)), + field("e4", new ArrowType.Int(8, false)), + field("e5", new ArrowType.Int(16, false)), + field("e6", new ArrowType.Int(32, false)), + field("e7", new ArrowType.Int(64, false)), + field("f", new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)), + field("f1", new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)), + field("g", new ArrowType.Utf8()), + field("h", new ArrowType.Binary()), + field("i", new ArrowType.Bool()), + field("j", new ArrowType.Decimal(5, 5)), + field("j1", new ArrowType.Decimal(15, 5)), + field("j2", new ArrowType.Decimal(25, 5)), + field("k", new ArrowType.Date(DateUnit.DAY)), + field("l", new ArrowType.Time(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, 32)), + field("m", new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC")) + )); private final MessageType supportedTypesParquetSchema = Types.buildMessage() - .addField(Types.optionalGroup() - 
.addField(Types.optional(BINARY).named("ba")) - .named("b")) - .addField(Types.optionalList(). - setElementType(Types.optional(BINARY).named("element")) - .named("c")) - .addField(Types.optional(INT32).as(INT_8).named("e")) - .addField(Types.optional(INT32).as(INT_16).named("e1")) - .addField(Types.optional(INT32).as(INT_32).named("e2")) - .addField(Types.optional(INT64).as(INT_64).named("e3")) - .addField(Types.optional(INT32).as(UINT_8).named("e4")) - .addField(Types.optional(INT32).as(UINT_16).named("e5")) - .addField(Types.optional(INT32).as(UINT_32).named("e6")) - .addField(Types.optional(INT64).as(UINT_64).named("e7")) - .addField(Types.optional(FLOAT).named("f")) - .addField(Types.optional(DOUBLE).named("f1")) - .addField(Types.optional(BINARY).as(UTF8).named("g")) - .addField(Types.optional(BINARY).named("h")) - .addField(Types.optional(BOOLEAN).named("i")) - .addField(Types.optional(INT32).as(DECIMAL).precision(5).scale(5).named("j")) - .addField(Types.optional(INT64).as(DECIMAL).precision(15).scale(5).named("j1")) - .addField(Types.optional(BINARY).as(DECIMAL).precision(25).scale(5).named("j2")) - .addField(Types.optional(INT32).as(DATE).named("k")) - .addField(Types.optional(INT32).as(TIME_MILLIS).named("l")) - .addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("m")) - .named("root"); + .addField(Types.optionalGroup() + .addField(Types.optional(BINARY).named("ba")) + .named("b")) + .addField(Types.optionalList(). 
+ setElementType(Types.optional(BINARY).named("element")) + .named("c")) + .addField(Types.optional(INT32).as(INT_8).named("e")) + .addField(Types.optional(INT32).as(INT_16).named("e1")) + .addField(Types.optional(INT32).as(INT_32).named("e2")) + .addField(Types.optional(INT64).as(INT_64).named("e3")) + .addField(Types.optional(INT32).as(UINT_8).named("e4")) + .addField(Types.optional(INT32).as(UINT_16).named("e5")) + .addField(Types.optional(INT32).as(UINT_32).named("e6")) + .addField(Types.optional(INT64).as(UINT_64).named("e7")) + .addField(Types.optional(FLOAT).named("f")) + .addField(Types.optional(DOUBLE).named("f1")) + .addField(Types.optional(BINARY).as(UTF8).named("g")) + .addField(Types.optional(BINARY).named("h")) + .addField(Types.optional(BOOLEAN).named("i")) + .addField(Types.optional(INT32).as(DECIMAL).precision(5).scale(5).named("j")) + .addField(Types.optional(INT64).as(DECIMAL).precision(15).scale(5).named("j1")) + .addField(Types.optional(BINARY).as(DECIMAL).precision(25).scale(5).named("j2")) + .addField(Types.optional(INT32).as(DATE).named("k")) + .addField(Types.optional(INT32).as(TIME_MILLIS).named("l")) + .addField(Types.optional(INT64).as(TIMESTAMP_MILLIS).named("m")) + .named("root"); private final Schema paperArrowSchema = new Schema(asList( - field("DocId", false, new ArrowType.Int(64, true)), - field("Links", new ArrowType.Struct_(), - field("Backward", false, new ArrowType.List(), field(null, false, new ArrowType.Int(64, true))), - field("Forward", false, new ArrowType.List(), field(null, false, new ArrowType.Int(64, true))) - ), - field("Name", false, new ArrowType.List(), - field(null, false, new ArrowType.Struct_(), - field("Language", false, new ArrowType.List(), - field(null, false, new ArrowType.Struct_(), - field("Code", false, new ArrowType.Binary()), - field("Country", new ArrowType.Binary()) - ) - ), - field("Url", new ArrowType.Binary()) + field("DocId", false, new ArrowType.Int(64, true)), + field("Links", new 
ArrowType.Struct(), + field("Backward", false, new ArrowType.List(), field(null, false, new ArrowType.Int(64, true))), + field("Forward", false, new ArrowType.List(), field(null, false, new ArrowType.Int(64, true))) + ), + field("Name", false, new ArrowType.List(), + field(null, false, new ArrowType.Struct(), + field("Language", false, new ArrowType.List(), + field(null, false, new ArrowType.Struct(), + field("Code", false, new ArrowType.Binary()), + field("Country", new ArrowType.Binary()) ) + ), + field("Url", new ArrowType.Binary()) ) + ) )); private SchemaConverter converter = new SchemaConverter(); @@ -286,7 +294,7 @@ private void compareFields(List left, List right) { @Test public void testAllMap() throws IOException { SchemaMapping map = converter.map(allTypesArrowSchema, allTypesParquetSchema); - Assert.assertEquals("p, s

, l

, u

, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p", toSummaryString(map)); + Assert.assertEquals("p, s

, l

, l

, u

, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p", toSummaryString(map)); } private String toSummaryString(SchemaMapping map) { @@ -301,32 +309,32 @@ private String toSummaryString(List fields) { sb.append(", "); } sb.append( - typeMapping.accept(new TypeMappingVisitor() { - @Override - public String visit(PrimitiveTypeMapping primitiveTypeMapping) { - return "p"; - } + typeMapping.accept(new TypeMappingVisitor() { + @Override + public String visit(PrimitiveTypeMapping primitiveTypeMapping) { + return "p"; + } - @Override - public String visit(StructTypeMapping structTypeMapping) { - return "s"; - } + @Override + public String visit(StructTypeMapping structTypeMapping) { + return "s"; + } - @Override - public String visit(UnionTypeMapping unionTypeMapping) { - return "u"; - } + @Override + public String visit(UnionTypeMapping unionTypeMapping) { + return "u"; + } - @Override - public String visit(ListTypeMapping listTypeMapping) { - return "l"; - } + @Override + public String visit(ListTypeMapping listTypeMapping) { + return "l"; + } - @Override - public String visit(RepeatedTypeMapping repeatedTypeMapping) { - return "r"; - } - }) + @Override + public String visit(RepeatedTypeMapping repeatedTypeMapping) { + return "r"; + } + }) ); if (typeMapping.getChildren() != null && !typeMapping.getChildren().isEmpty()) { sb.append("<").append(toSummaryString(typeMapping.getChildren())).append(">");