From e9305bd0a79b3bb03f1bd9b646581951fd571cd6 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 26 Jul 2015 22:52:28 +0800 Subject: [PATCH 01/21] Refactors Parquet write support to follow Parquet format spec --- .../sql/parquet/CatalystReadSupport.scala | 4 +- .../sql/parquet/CatalystWriteSupport.scala | 307 +++++++++++++++++ .../spark/sql/parquet/ParquetRelation.scala | 32 +- .../sql/parquet/ParquetTableSupport.scala | 322 ------------------ 4 files changed, 321 insertions(+), 344 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala index 975fec101d9c2..9648035744c1d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala @@ -64,7 +64,7 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with // If the target file was written by Spark SQL, we should be able to find a serialized Catalyst // schema of this file from its the metadata. - val maybeRowSchema = Option(conf.get(RowWriteSupport.SPARK_ROW_SCHEMA)) + val maybeRowSchema = Option(conf.get(CatalystWriteSupport.SPARK_ROW_SCHEMA)) // Optional schema of requested columns, in the form of a string serialized from a Catalyst // `StructType` containing all requested columns. @@ -139,7 +139,7 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with val metadata = Map.empty[String, String] ++ maybeRequestedSchema.map(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++ - maybeRowSchema.map(RowWriteSupport.SPARK_ROW_SCHEMA -> _) + maybeRowSchema.map(CatalystWriteSupport.SPARK_ROW_SCHEMA -> _) logInfo(s"Going to read Parquet file with these requested columns: $parquetRequestedSchema") new ReadContext(parquetRequestedSchema, metadata) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala new file mode 100644 index 0000000000000..8410ae26f8705 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -0,0 +1,307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.parquet + +import java.nio.{ByteBuffer, ByteOrder} + +import scala.collection.JavaConverters.mapAsJavaMapConverter + +import org.apache.hadoop.conf.Configuration +import org.apache.parquet.column.ParquetProperties +import org.apache.parquet.hadoop.ParquetOutputFormat +import org.apache.parquet.hadoop.api.WriteSupport +import org.apache.parquet.hadoop.api.WriteSupport.WriteContext +import org.apache.parquet.io.api.{Binary, RecordConsumer} + +import org.apache.spark.Logging +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.types._ + +private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { + type ValueConsumer = (InternalRow, Int) => Unit + + private var schema: StructType = _ + + private var recordConsumer: RecordConsumer = _ + + private var followParquetFormatSpec: Boolean = _ + + // Byte array used to write timestamps as Parquet INT96 values + private val timestampBuffer = new Array[Byte](12) + + // Byte array used to write decimal values + private val decimalBuffer = new Array[Byte](8) + + override def init(configuration: Configuration): WriteContext = { + val schemaString = configuration.get(CatalystWriteSupport.SPARK_ROW_SCHEMA) + schema = StructType.fromString(schemaString) + + assert(configuration.get(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key) != null) + followParquetFormatSpec = + configuration.getBoolean( + SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, + SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get) + + val messageType = new CatalystSchemaConverter(configuration).convert(schema) + val metadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> schemaString).asJava + + logDebug( + s"""Initialized Parquet WriteSupport with Catalyst schema: + |${schema.prettyJson} + |and corresponding Parquet message type: + |$messageType + """.stripMargin) + + new WriteContext(messageType, metadata) + } + + override def prepareForWrite(recordConsumer: RecordConsumer): Unit = { + this.recordConsumer = recordConsumer + } + + override def write(row: InternalRow): Unit = { + assert(row.numFields == schema.length) + recordConsumer.startMessage() + writeFields(row) + recordConsumer.endMessage() + } + + private def writeFields(row: InternalRow): Unit = { + val consumers = schema.map(_.dataType).map(makeConsumer) + var i = 0 + + while (i < row.numFields) { + if (!row.isNullAt(i)) { + consumeField(schema(i).name, i) { + consumers(i).apply(row, i) + } + } + + i += 1 + } + } + + private def makeConsumer(dataType: DataType): ValueConsumer = { + dataType match { + case BooleanType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addBoolean(row.getBoolean(ordinal)) + + case ByteType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addInteger(row.getByte(ordinal)) + + case ShortType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addInteger(row.getShort(ordinal)) + + case IntegerType | DateType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addInteger(row.getInt(ordinal)) + + case LongType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addLong(row.getLong(ordinal)) + + case FloatType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addFloat(row.getFloat(ordinal)) + + case DoubleType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addDouble(row.getDouble(ordinal)) + + 
case StringType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addBinary(Binary.fromByteArray(row.getUTF8String(ordinal).getBytes)) + + case TimestampType => + (row: InternalRow, ordinal: Int) => { + val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(row.getLong(ordinal)) + val buf = ByteBuffer.wrap(timestampBuffer) + buf.order(ByteOrder.LITTLE_ENDIAN).putLong(timeOfDayNanos).putInt(julianDay) + recordConsumer.addBinary(Binary.fromByteArray(timestampBuffer)) + } + + case BinaryType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal))) + + case DecimalType.Unlimited => + sys.error(s"Unsupported data type $dataType. Decimal precision must be specified.") + + case DecimalType.Fixed(precision, _) if precision > 18 => + sys.error(s"Unsupported data type $dataType. Decimal precision cannot be greater than 18.") + + case DecimalType.Fixed(precision) => + (row: InternalRow, ordinal: Int) => { + val decimal = row.getDecimal(ordinal) + val numBytes = ParquetTypesConverter.BYTES_FOR_PRECISION(precision) + val unscaledLong = decimal.toUnscaledLong + + var i = 0 + var shift = 8 * (numBytes - 1) + + while (i < numBytes) { + decimalBuffer(i) = (unscaledLong >> shift).toByte + i += 1 + shift -= 8 + } + + recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) + } + + case StructType(fields) => + (row: InternalRow, ordinal: Int) => + consumeGroup(writeFields(row.getStruct(ordinal, fields.length))) + + case arrayType: ArrayType if followParquetFormatSpec => + makeStandardArrayConsumer(arrayType.elementType) + + case arrayType: ArrayType if !followParquetFormatSpec => + makeLegacyArrayConsumer(arrayType.elementType, arrayType.containsNull) + + case mapType: MapType if followParquetFormatSpec => + makeMapConsumer(mapType.keyType, mapType.valueType, "key_value") + + case mapType: MapType if !followParquetFormatSpec => + makeMapConsumer(mapType.keyType, mapType.valueType, "map") + + case _ => + sys.error(s"Unsupported data type $dataType.") + } + } + + private def makeStandardArrayConsumer(elementType: DataType): ValueConsumer = { + makeThreeLevelArrayConsumer(elementType, "list", "element") + } + + private def makeLegacyArrayConsumer( + elementType: DataType, + containsNull: Boolean): ValueConsumer = { + if (containsNull) { + makeThreeLevelArrayConsumer(elementType, "bag", "array") + } else { + makeTwoLevelArrayConsumer(elementType, "array") + } + } + + private def makeThreeLevelArrayConsumer( + elementType: DataType, + repeatedGroupName: String, + elementFieldName: String): ValueConsumer = { + val elementConsumer = makeConsumer(elementType) + val mutableRow = new SpecificMutableRow(elementType :: Nil) + + (row: InternalRow, ordinal: Int) => { + consumeGroup { + consumeField(repeatedGroupName, 0) { + val array = row.get(ordinal).asInstanceOf[Array[_]] + var i = 0 + + while (i < array.length) { + consumeGroup { + if (array(i) != null) { + mutableRow.update(0, array(i)) + consumeField(elementFieldName, 0)(elementConsumer.apply(mutableRow, 0)) + } + } + + i += 1 + } + } + } + } + } + + private def makeTwoLevelArrayConsumer( + elementType: DataType, + repeatedFieldName: String): ValueConsumer = { + val elementConsumer = makeConsumer(elementType) + val mutableRow = new SpecificMutableRow(elementType :: Nil) + + (row: InternalRow, ordinal: Int) => { + consumeGroup { + consumeField(repeatedFieldName, 0) { + val array = row.get(ordinal).asInstanceOf[Array[_]] + var i = 0 + + while (i < array.length) { + 
mutableRow.update(0, array(i)) + elementConsumer.apply(mutableRow, 0) + i += 1 + } + } + } + } + } + + private def makeMapConsumer( + keyType: DataType, + valueType: DataType, + repeatedGroupName: String): ValueConsumer = { + val keyConsumer = makeConsumer(keyType) + val valueConsumer = makeConsumer(valueType) + val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) + + (row: InternalRow, ordinal: Int) => { + consumeGroup { + consumeField(repeatedGroupName, 0) { + val map = row.get(ordinal).asInstanceOf[Map[_, _]] + for ((key, value) <- map) { + consumeGroup { + mutableRow.update(0, key) + consumeField("key", 0)(keyConsumer.apply(mutableRow, 0)) + if (value != null) { + mutableRow.update(1, value) + consumeField("value", 1)(valueConsumer.apply(mutableRow, 1)) + } + } + } + } + } + } + } + + private def consumeGroup(f: => Unit): Unit = { + recordConsumer.startGroup() + f + recordConsumer.endGroup() + } + + private def consumeField(field: String, index: Int)(f: => Unit): Unit = { + recordConsumer.startField(field, index) + f + recordConsumer.endField(field, index) + } +} + +private[parquet] object CatalystWriteSupport { + val SPARK_ROW_SCHEMA: String = "org.apache.spark.sql.parquet.row.attributes" + + def setSchema(schema: StructType, configuration: Configuration): Unit = { + schema.map(_.name).foreach(CatalystSchemaConverter.checkFieldName) + configuration.set(SPARK_ROW_SCHEMA, schema.json) + configuration.set( + ParquetOutputFormat.WRITER_VERSION, + ParquetProperties.WriterVersion.PARQUET_1_0.toString) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala index b4337a48dbd80..2d98792385664 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala @@ -37,16 +37,17 @@ import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetRecordReader, _ import org.apache.parquet.schema.MessageType import org.apache.parquet.{Log => ParquetLog} -import org.apache.spark.{Logging, Partition => SparkPartition, SparkException} import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.{SqlNewHadoopPartition, SqlNewHadoopRDD, RDD} import org.apache.spark.rdd.RDD._ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionSpec +import org.apache.spark.sql.execution.{SqlNewHadoopPartition, SqlNewHadoopRDD} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.{SerializableConfiguration, Utils} +import org.apache.spark.{Logging, Partition => SparkPartition, SparkException} private[sql] class DefaultSource extends HadoopFsRelationProvider { @@ -228,18 +229,13 @@ private[sql] class ParquetRelation( // bundled with `ParquetOutputFormat[Row]`. job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]]) - // TODO There's no need to use two kinds of WriteSupport - // We should unify them. `SpecificMutableRow` can process both atomic (primitive) types and - // complex types. 
- val writeSupportClass = - if (dataSchema.map(_.dataType).forall(ParquetTypesConverter.isPrimitiveType)) { - classOf[MutableRowWriteSupport] - } else { - classOf[RowWriteSupport] - } + ParquetOutputFormat.setWriteSupportClass(job, classOf[CatalystWriteSupport]) + CatalystWriteSupport.setSchema(dataSchema, conf) - ParquetOutputFormat.setWriteSupportClass(job, writeSupportClass) - RowWriteSupport.setSchema(dataSchema.toAttributes, conf) + // Sets flag for Parquet schema converter (converting Catalyst schema to Parquet schema) + conf.set( + SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, + sqlContext.conf.followParquetFormatSpec.toString) // Sets compression scheme conf.set( @@ -267,7 +263,6 @@ private[sql] class ParquetRelation( val parquetFilterPushDown = sqlContext.conf.parquetFilterPushDown val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp - val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec // Create the function to set variable Parquet confs at both driver and executor side. val initLocalJobFuncOpt = @@ -278,8 +273,7 @@ private[sql] class ParquetRelation( useMetadataCache, parquetFilterPushDown, assumeBinaryIsString, - assumeInt96IsTimestamp, - followParquetFormatSpec) _ + assumeInt96IsTimestamp) _ // Create the function to set input paths at the driver side. val setInputPaths = ParquetRelation.initializeDriverSideJobFunc(inputFiles) _ @@ -479,8 +473,7 @@ private[sql] object ParquetRelation extends Logging { useMetadataCache: Boolean, parquetFilterPushDown: Boolean, assumeBinaryIsString: Boolean, - assumeInt96IsTimestamp: Boolean, - followParquetFormatSpec: Boolean)(job: Job): Unit = { + assumeInt96IsTimestamp: Boolean)(job: Job): Unit = { val conf = job.getConfiguration conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[CatalystReadSupport].getName) @@ -501,16 +494,15 @@ private[sql] object ParquetRelation extends Logging { }) conf.set( - RowWriteSupport.SPARK_ROW_SCHEMA, + CatalystWriteSupport.SPARK_ROW_SCHEMA, CatalystSchemaConverter.checkFieldNames(dataSchema).json) // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata conf.setBoolean(SQLConf.PARQUET_CACHE_METADATA.key, useMetadataCache) - // Sets flags for Parquet schema conversion + // Sets flags for Parquet schema converter (converting Parquet schema to Catalyst schema) conf.setBoolean(SQLConf.PARQUET_BINARY_AS_STRING.key, assumeBinaryIsString) conf.setBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, assumeInt96IsTimestamp) - conf.setBoolean(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, followParquetFormatSpec) } /** This closure sets input paths at the driver side. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala deleted file mode 100644 index 9cd0250f9c510..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.parquet - -import java.math.BigInteger -import java.nio.{ByteBuffer, ByteOrder} -import java.util.{HashMap => JHashMap} - -import org.apache.hadoop.conf.Configuration -import org.apache.parquet.column.ParquetProperties -import org.apache.parquet.hadoop.ParquetOutputFormat -import org.apache.parquet.hadoop.api.WriteSupport -import org.apache.parquet.io.api._ - -import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String - -/** - * A `parquet.hadoop.api.WriteSupport` for Row objects. - */ -private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Logging { - - private[parquet] var writer: RecordConsumer = null - private[parquet] var attributes: Array[Attribute] = null - - override def init(configuration: Configuration): WriteSupport.WriteContext = { - val origAttributesStr: String = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA) - val metadata = new JHashMap[String, String]() - metadata.put(CatalystReadSupport.SPARK_METADATA_KEY, origAttributesStr) - - if (attributes == null) { - attributes = ParquetTypesConverter.convertFromString(origAttributesStr).toArray - } - - log.debug(s"write support initialized for requested schema $attributes") - ParquetRelation.enableLogForwarding() - new WriteSupport.WriteContext(ParquetTypesConverter.convertFromAttributes(attributes), metadata) - } - - override def prepareForWrite(recordConsumer: RecordConsumer): Unit = { - writer = recordConsumer - log.debug(s"preparing for write with schema $attributes") - } - - override def write(record: InternalRow): Unit = { - val attributesSize = attributes.size - if (attributesSize > record.numFields) { - throw new IndexOutOfBoundsException("Trying to write more fields than contained in row " + - s"($attributesSize > ${record.numFields})") - } - - var index = 0 - writer.startMessage() - while(index < attributesSize) { - // null values indicate optional fields but we do not check currently - if (!record.isNullAt(index)) { - writer.startField(attributes(index).name, index) - writeValue(attributes(index).dataType, record.get(index, attributes(index).dataType)) - writer.endField(attributes(index).name, index) - } - index = index + 1 - } - writer.endMessage() - } - - private[parquet] def writeValue(schema: DataType, value: Any): Unit = { - if (value != null) { - schema match { - case t: UserDefinedType[_] => writeValue(t.sqlType, value) - case t @ ArrayType(_, _) => writeArray( - t, - value.asInstanceOf[CatalystConverter.ArrayScalaType]) - case t @ MapType(_, _, _) => writeMap( - t, - value.asInstanceOf[CatalystConverter.MapScalaType]) - case t @ StructType(_) => writeStruct( - t, - value.asInstanceOf[CatalystConverter.StructScalaType]) - case _ => writePrimitive(schema.asInstanceOf[AtomicType], value) - } - } - } - - private[parquet] def writePrimitive(schema: DataType, value: Any): Unit = { - if (value != null) { - schema match { 
- case BooleanType => writer.addBoolean(value.asInstanceOf[Boolean]) - case ByteType => writer.addInteger(value.asInstanceOf[Byte]) - case ShortType => writer.addInteger(value.asInstanceOf[Short]) - case IntegerType | DateType => writer.addInteger(value.asInstanceOf[Int]) - case LongType => writer.addLong(value.asInstanceOf[Long]) - case TimestampType => writeTimestamp(value.asInstanceOf[Long]) - case FloatType => writer.addFloat(value.asInstanceOf[Float]) - case DoubleType => writer.addDouble(value.asInstanceOf[Double]) - case StringType => writer.addBinary( - Binary.fromByteArray(value.asInstanceOf[UTF8String].getBytes)) - case BinaryType => writer.addBinary( - Binary.fromByteArray(value.asInstanceOf[Array[Byte]])) - case DecimalType.Fixed(precision, _) => - writeDecimal(value.asInstanceOf[Decimal], precision) - case _ => sys.error(s"Do not know how to writer $schema to consumer") - } - } - } - - private[parquet] def writeStruct( - schema: StructType, - struct: CatalystConverter.StructScalaType): Unit = { - if (struct != null) { - val fields = schema.fields.toArray - writer.startGroup() - var i = 0 - while(i < fields.length) { - if (!struct.isNullAt(i)) { - writer.startField(fields(i).name, i) - writeValue(fields(i).dataType, struct.get(i, fields(i).dataType)) - writer.endField(fields(i).name, i) - } - i = i + 1 - } - writer.endGroup() - } - } - - private[parquet] def writeArray( - schema: ArrayType, - array: CatalystConverter.ArrayScalaType): Unit = { - val elementType = schema.elementType - writer.startGroup() - if (array.numElements() > 0) { - if (schema.containsNull) { - writer.startField(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME, 0) - var i = 0 - while (i < array.numElements()) { - writer.startGroup() - if (!array.isNullAt(i)) { - writer.startField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0) - writeValue(elementType, array.get(i, elementType)) - writer.endField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0) - } - writer.endGroup() - i = i + 1 - } - writer.endField(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME, 0) - } else { - writer.startField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0) - var i = 0 - while (i < array.numElements()) { - writeValue(elementType, array.get(i, elementType)) - i = i + 1 - } - writer.endField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0) - } - } - writer.endGroup() - } - - private[parquet] def writeMap( - schema: MapType, - map: CatalystConverter.MapScalaType): Unit = { - writer.startGroup() - val length = map.numElements() - if (length > 0) { - writer.startField(CatalystConverter.MAP_SCHEMA_NAME, 0) - map.foreach(schema.keyType, schema.valueType, (key, value) => { - writer.startGroup() - writer.startField(CatalystConverter.MAP_KEY_SCHEMA_NAME, 0) - writeValue(schema.keyType, key) - writer.endField(CatalystConverter.MAP_KEY_SCHEMA_NAME, 0) - if (value != null) { - writer.startField(CatalystConverter.MAP_VALUE_SCHEMA_NAME, 1) - writeValue(schema.valueType, value) - writer.endField(CatalystConverter.MAP_VALUE_SCHEMA_NAME, 1) - } - writer.endGroup() - }) - writer.endField(CatalystConverter.MAP_SCHEMA_NAME, 0) - } - writer.endGroup() - } - - // Scratch array used to write decimals as fixed-length byte array - private[this] var reusableDecimalBytes = new Array[Byte](16) - - private[parquet] def writeDecimal(decimal: Decimal, precision: Int): Unit = { - val numBytes = CatalystSchemaConverter.minBytesForPrecision(precision) - - def longToBinary(unscaled: Long): Binary = { - var i = 0 - var shift = 8 * (numBytes - 1) - while (i < 
numBytes) { - reusableDecimalBytes(i) = (unscaled >> shift).toByte - i += 1 - shift -= 8 - } - Binary.fromByteArray(reusableDecimalBytes, 0, numBytes) - } - - def bigIntegerToBinary(unscaled: BigInteger): Binary = { - unscaled.toByteArray match { - case bytes if bytes.length == numBytes => - Binary.fromByteArray(bytes) - - case bytes if bytes.length <= reusableDecimalBytes.length => - val signedByte = (if (bytes.head < 0) -1 else 0).toByte - java.util.Arrays.fill(reusableDecimalBytes, 0, numBytes - bytes.length, signedByte) - System.arraycopy(bytes, 0, reusableDecimalBytes, numBytes - bytes.length, bytes.length) - Binary.fromByteArray(reusableDecimalBytes, 0, numBytes) - - case bytes => - reusableDecimalBytes = new Array[Byte](bytes.length) - bigIntegerToBinary(unscaled) - } - } - - val binary = if (numBytes <= 8) { - longToBinary(decimal.toUnscaledLong) - } else { - bigIntegerToBinary(decimal.toJavaBigDecimal.unscaledValue()) - } - - writer.addBinary(binary) - } - - // array used to write Timestamp as Int96 (fixed-length binary) - private[this] val int96buf = new Array[Byte](12) - - private[parquet] def writeTimestamp(ts: Long): Unit = { - val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(ts) - val buf = ByteBuffer.wrap(int96buf) - buf.order(ByteOrder.LITTLE_ENDIAN) - buf.putLong(timeOfDayNanos) - buf.putInt(julianDay) - writer.addBinary(Binary.fromByteArray(int96buf)) - } -} - -// Optimized for non-nested rows -private[parquet] class MutableRowWriteSupport extends RowWriteSupport { - override def write(record: InternalRow): Unit = { - val attributesSize = attributes.size - if (attributesSize > record.numFields) { - throw new IndexOutOfBoundsException("Trying to write more fields than contained in row " + - s"($attributesSize > ${record.numFields})") - } - - var index = 0 - writer.startMessage() - while(index < attributesSize) { - // null values indicate optional fields but we do not check currently - if (!record.isNullAt(index) && !record.isNullAt(index)) { - writer.startField(attributes(index).name, index) - consumeType(attributes(index).dataType, record, index) - writer.endField(attributes(index).name, index) - } - index = index + 1 - } - writer.endMessage() - } - - private def consumeType( - ctype: DataType, - record: InternalRow, - index: Int): Unit = { - ctype match { - case BooleanType => writer.addBoolean(record.getBoolean(index)) - case ByteType => writer.addInteger(record.getByte(index)) - case ShortType => writer.addInteger(record.getShort(index)) - case IntegerType | DateType => writer.addInteger(record.getInt(index)) - case LongType => writer.addLong(record.getLong(index)) - case TimestampType => writeTimestamp(record.getLong(index)) - case FloatType => writer.addFloat(record.getFloat(index)) - case DoubleType => writer.addDouble(record.getDouble(index)) - case StringType => - writer.addBinary(Binary.fromByteArray(record.getUTF8String(index).getBytes)) - case BinaryType => - writer.addBinary(Binary.fromByteArray(record.getBinary(index))) - case DecimalType.Fixed(precision, scale) => - writeDecimal(record.getDecimal(index, precision, scale), precision) - case _ => sys.error(s"Unsupported datatype $ctype, cannot write to consumer") - } - } -} - -private[parquet] object RowWriteSupport { - val SPARK_ROW_SCHEMA: String = "org.apache.spark.sql.parquet.row.attributes" - - def getSchema(configuration: Configuration): Seq[Attribute] = { - val schemaString = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA) - if (schemaString == null) { - throw new 
RuntimeException("Missing schema!") - } - ParquetTypesConverter.convertFromString(schemaString) - } - - def setSchema(schema: Seq[Attribute], configuration: Configuration) { - val encoded = ParquetTypesConverter.convertToString(schema) - configuration.set(SPARK_ROW_SCHEMA, encoded) - configuration.set( - ParquetOutputFormat.WRITER_VERSION, - ParquetProperties.WriterVersion.PARQUET_1_0.toString) - } -} From b465661b09cd34581babddeccafb4ea887777d34 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 27 Jul 2015 01:36:09 +0800 Subject: [PATCH 02/21] Cleans up code only used by old RowWriteSupport --- .../sql/parquet/CatalystSchemaConverter.scala | 2 +- .../sql/parquet/CatalystWriteSupport.scala | 15 +- .../spark/sql/parquet/ParquetConverter.scala | 39 ----- .../spark/sql/parquet/ParquetTypes.scala | 159 ------------------ .../parquet/ParquetCompatibilityTest.scala | 1 + .../spark/sql/parquet/ParquetIOSuite.scala | 11 +- .../sql/parquet/ParquetSchemaSuite.scala | 4 +- .../spark/sql/parquet/ParquetTest.scala | 35 ++++ 8 files changed, 57 insertions(+), 209 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala index d43ca95b4eea0..41b3c9d73e0af 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala @@ -453,7 +453,7 @@ private[parquet] class CatalystSchemaConverter( .buildGroup(REPEATED) // "array_element" is the name chosen by parquet-hive (1.7.0 and prior version) .addField(convertField(StructField("array_element", elementType, nullable))) - .named(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME)) + .named("bag")) // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level // LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 8410ae26f8705..91768e5b59bcb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -151,10 +151,10 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi case DecimalType.Fixed(precision, _) if precision > 18 => sys.error(s"Unsupported data type $dataType. Decimal precision cannot be greater than 18.") - case DecimalType.Fixed(precision) => + case DecimalType.Fixed(precision, _) => (row: InternalRow, ordinal: Int) => { val decimal = row.getDecimal(ordinal) - val numBytes = ParquetTypesConverter.BYTES_FOR_PRECISION(precision) + val numBytes = CatalystWriteSupport.BYTES_FOR_PRECISION(precision) val unscaledLong = decimal.toUnscaledLong var i = 0 @@ -304,4 +304,15 @@ private[parquet] object CatalystWriteSupport { ParquetOutputFormat.WRITER_VERSION, ParquetProperties.WriterVersion.PARQUET_1_0.toString) } + + /** + * Compute the FIXED_LEN_BYTE_ARRAY length needed to represent a given DECIMAL precision. 
+ */ + private[parquet] val BYTES_FOR_PRECISION = Array.tabulate[Int](38) { precision => + var length = 1 + while (math.pow(2.0, 8 * length - 1) < math.pow(10.0, precision)) { + length += 1 + } + length + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala deleted file mode 100644 index 6ed3580af0729..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.parquet - -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.types.{MapData, ArrayData} - -// TODO Removes this while fixing SPARK-8848 -private[sql] object CatalystConverter { - // This is mostly Parquet convention (see, e.g., `ConversionPatterns`). - // Note that "array" for the array elements is chosen by ParquetAvro. - // Using a different value will result in Parquet silently dropping columns. - val ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME = "bag" - val ARRAY_ELEMENTS_SCHEMA_NAME = "array" - - val MAP_KEY_SCHEMA_NAME = "key" - val MAP_VALUE_SCHEMA_NAME = "value" - val MAP_SCHEMA_NAME = "map" - - // TODO: consider using Array[T] for arrays to avoid boxing of primitive types - type ArrayScalaType = ArrayData - type StructScalaType = InternalRow - type MapScalaType = MapData -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala deleted file mode 100644 index 3854f5bd39fb1..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.parquet - -import java.io.IOException - -import scala.collection.JavaConversions._ -import scala.util.Try - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.mapreduce.Job -import org.apache.parquet.format.converter.ParquetMetadataConverter -import org.apache.parquet.hadoop.metadata.{FileMetaData, ParquetMetadata} -import org.apache.parquet.hadoop.util.ContextUtil -import org.apache.parquet.hadoop.{Footer, ParquetFileReader, ParquetFileWriter} -import org.apache.parquet.schema.MessageType - -import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.types._ - - -private[parquet] object ParquetTypesConverter extends Logging { - def isPrimitiveType(ctype: DataType): Boolean = ctype match { - case _: NumericType | BooleanType | DateType | TimestampType | StringType | BinaryType => true - case _ => false - } - - /** - * Compute the FIXED_LEN_BYTE_ARRAY length needed to represent a given DECIMAL precision. - */ - private[parquet] val BYTES_FOR_PRECISION = Array.tabulate[Int](38) { precision => - var length = 1 - while (math.pow(2.0, 8 * length - 1) < math.pow(10.0, precision)) { - length += 1 - } - length - } - - def convertFromAttributes(attributes: Seq[Attribute]): MessageType = { - val converter = new CatalystSchemaConverter() - converter.convert(StructType.fromAttributes(attributes)) - } - - def convertFromString(string: String): Seq[Attribute] = { - Try(DataType.fromJson(string)).getOrElse(DataType.fromCaseClassString(string)) match { - case s: StructType => s.toAttributes - case other => sys.error(s"Can convert $string to row") - } - } - - def convertToString(schema: Seq[Attribute]): String = { - schema.map(_.name).foreach(CatalystSchemaConverter.checkFieldName) - StructType.fromAttributes(schema).json - } - - def writeMetaData(attributes: Seq[Attribute], origPath: Path, conf: Configuration): Unit = { - if (origPath == null) { - throw new IllegalArgumentException("Unable to write Parquet metadata: path is null") - } - val fs = origPath.getFileSystem(conf) - if (fs == null) { - throw new IllegalArgumentException( - s"Unable to write Parquet metadata: path $origPath is incorrectly formatted") - } - val path = origPath.makeQualified(fs) - if (fs.exists(path) && !fs.getFileStatus(path).isDir) { - throw new IllegalArgumentException(s"Expected to write to directory $path but found file") - } - val metadataPath = new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE) - if (fs.exists(metadataPath)) { - try { - fs.delete(metadataPath, true) - } catch { - case e: IOException => - throw new IOException(s"Unable to delete previous PARQUET_METADATA_FILE at $metadataPath") - } - } - val extraMetadata = new java.util.HashMap[String, String]() - extraMetadata.put( - CatalystReadSupport.SPARK_METADATA_KEY, - ParquetTypesConverter.convertToString(attributes)) - // TODO: add extra data, e.g., table name, date, etc.? - - val parquetSchema: MessageType = ParquetTypesConverter.convertFromAttributes(attributes) - val metaData: FileMetaData = new FileMetaData( - parquetSchema, - extraMetadata, - "Spark") - - ParquetRelation.enableLogForwarding() - ParquetFileWriter.writeMetadataFile( - conf, - path, - new Footer(path, new ParquetMetadata(metaData, Nil)) :: Nil) - } - - /** - * Try to read Parquet metadata at the given Path. We first see if there is a summary file - * in the parent directory. If so, this is used. 
Else we read the actual footer at the given - * location. - * @param origPath The path at which we expect one (or more) Parquet files. - * @param configuration The Hadoop configuration to use. - * @return The `ParquetMetadata` containing among other things the schema. - */ - def readMetaData(origPath: Path, configuration: Option[Configuration]): ParquetMetadata = { - if (origPath == null) { - throw new IllegalArgumentException("Unable to read Parquet metadata: path is null") - } - val job = new Job() - val conf = configuration.getOrElse(ContextUtil.getConfiguration(job)) - val fs: FileSystem = origPath.getFileSystem(conf) - if (fs == null) { - throw new IllegalArgumentException(s"Incorrectly formatted Parquet metadata path $origPath") - } - val path = origPath.makeQualified(fs) - - val children = - fs - .globStatus(path) - .flatMap { status => if (status.isDir) fs.listStatus(status.getPath) else List(status) } - .filterNot { status => - val name = status.getPath.getName - (name(0) == '.' || name(0) == '_') && name != ParquetFileWriter.PARQUET_METADATA_FILE - } - - ParquetRelation.enableLogForwarding() - - // NOTE (lian): Parquet "_metadata" file can be very slow if the file consists of lots of row - // groups. Since Parquet schema is replicated among all row groups, we only need to touch a - // single row group to read schema related metadata. Notice that we are making assumptions that - // all data in a single Parquet file have the same schema, which is normally true. - children - // Try any non-"_metadata" file first... - .find(_.getPath.getName != ParquetFileWriter.PARQUET_METADATA_FILE) - // ... and fallback to "_metadata" if no such file exists (which implies the Parquet file is - // empty, thus normally the "_metadata" file is expected to be fairly small). 
- .orElse(children.find(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE)) - .map(ParquetFileReader.readFooter(conf, _, ParquetMetadataConverter.NO_FILTER)) - .getOrElse( - throw new IllegalArgumentException(s"Could not find Parquet metadata at path $path")) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala index b4cdfd9e98f6f..0238f0f0a9a1a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala @@ -16,6 +16,7 @@ */ package org.apache.spark.sql.parquet + import java.io.File import scala.collection.JavaConversions._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index b415da5b8c136..5a54cf75e6a3f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -204,8 +204,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest { test("compression codec") { def compressionCodecFor(path: String): String = { - val codecs = ParquetTypesConverter - .readMetaData(new Path(path), Some(configuration)) + val codecs = readMetadata(new Path(path), configuration) .getBlocks .flatMap(_.getColumns) .map(_.getCodec.name()) @@ -277,15 +276,15 @@ class ParquetIOSuite extends QueryTest with ParquetTest { withTempPath { file => val path = new Path(file.toURI.toString) val fs = FileSystem.getLocal(configuration) - val attributes = ScalaReflection.attributesFor[(Int, String)] - ParquetTypesConverter.writeMetaData(attributes, path, configuration) + val schema = StructType.fromAttributes(ScalaReflection.attributesFor[(Int, String)]) + writeMetadata(schema, path, configuration) assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))) assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE))) - val metaData = ParquetTypesConverter.readMetaData(path, Some(configuration)) + val metaData = readMetadata(path, configuration) val actualSchema = metaData.getFileMetaData.getSchema - val expectedSchema = ParquetTypesConverter.convertFromAttributes(attributes) + val expectedSchema = new CatalystSchemaConverter(configuration).convert(schema) actualSchema.checkContains(expectedSchema) expectedSchema.checkContains(actualSchema) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index 4a0b3b60f419d..7deba1cf6ebfc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -361,8 +361,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { val jsonString = """{"type":"struct","fields":[{"name":"c1","type":"integer","nullable":false,"metadata":{}},{"name":"c2","type":"binary","nullable":true,"metadata":{}}]}""" // scalastyle:on - val fromCaseClassString = ParquetTypesConverter.convertFromString(caseClassString) - val fromJson = ParquetTypesConverter.convertFromString(jsonString) + val fromCaseClassString = StructType.fromString(caseClassString) + val fromJson = StructType.fromString(jsonString) (fromCaseClassString, fromJson).zipped.foreach { (a, b) => 
assert(a.name == b.name) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala index 64e94056f209a..c1c7ca9ae5821 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -19,11 +19,20 @@ package org.apache.spark.sql.parquet import java.io.File +import scala.collection.JavaConverters.{mapAsJavaMapConverter, seqAsJavaListConverter} import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.parquet.format.converter.ParquetMetadataConverter +import org.apache.parquet.hadoop.metadata.{BlockMetaData, FileMetaData, ParquetMetadata} +import org.apache.parquet.hadoop.{Footer, ParquetFileReader, ParquetFileWriter} + import org.apache.spark.SparkFunSuite +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SaveMode} /** @@ -97,4 +106,30 @@ private[sql] trait ParquetTest extends SQLTestUtils { this: SparkFunSuite => assert(partDir.mkdirs(), s"Couldn't create directory $partDir") partDir } + + def writeMetadata(schema: StructType, path: Path, configuration: Configuration): Unit = { + val parquetSchema = new CatalystSchemaConverter(configuration).convert(schema) + val extraMetadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> schema.json).asJava + val createdBy = s"Apache Spark ${org.apache.spark.SPARK_VERSION}" + val fileMetadata = new FileMetaData(parquetSchema, extraMetadata, createdBy) + val parquetMetadata = new ParquetMetadata(fileMetadata, Seq.empty[BlockMetaData].asJava) + val footer = new Footer(path, parquetMetadata) + ParquetFileWriter.writeMetadataFile(configuration, path, Seq(footer).asJava) + } + + + def readMetadata(path: Path, configuration: Configuration): ParquetMetadata = { + val summaryFileNames = Seq( + ParquetFileWriter.PARQUET_METADATA_FILE, + ParquetFileWriter.PARQUET_COMMON_METADATA_FILE) + + val fs = path.getFileSystem(configuration) + val leaves = SparkHadoopUtil.get.listLeafStatuses(fs, path).filter { f => + val name = f.getPath.getName + name.startsWith(".") && name.startsWith("_") || summaryFileNames.contains(name) + } + + ParquetFileReader.readFooter( + configuration, leaves.head, ParquetMetadataConverter.SKIP_ROW_GROUPS) + } } From 821e9ec25b755b4f46d715cbee465f2cb4432afb Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 27 Jul 2015 19:43:14 +0800 Subject: [PATCH 03/21] Fixes test failures --- .../sql/parquet/CatalystWriteSupport.scala | 24 +++++++++---------- .../DirectParquetOutputCommitter.scala | 2 +- .../spark/sql/parquet/ParquetIOSuite.scala | 23 +++++++++--------- .../spark/sql/parquet/ParquetTest.scala | 22 +++++++---------- 4 files changed, 33 insertions(+), 38 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 91768e5b59bcb..ca613eb47d871 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -78,13 +78,10 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } override def write(row: InternalRow): Unit = { - 
assert(row.numFields == schema.length) - recordConsumer.startMessage() - writeFields(row) - recordConsumer.endMessage() + consumeMessage(writeFields(row, schema)) } - private def writeFields(row: InternalRow): Unit = { + private def writeFields(row: InternalRow, schema: StructType): Unit = { val consumers = schema.map(_.dataType).map(makeConsumer) var i = 0 @@ -145,9 +142,6 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal))) - case DecimalType.Unlimited => - sys.error(s"Unsupported data type $dataType. Decimal precision must be specified.") - case DecimalType.Fixed(precision, _) if precision > 18 => sys.error(s"Unsupported data type $dataType. Decimal precision cannot be greater than 18.") @@ -169,9 +163,9 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) } - case StructType(fields) => + case structType @ StructType(fields) => (row: InternalRow, ordinal: Int) => - consumeGroup(writeFields(row.getStruct(ordinal, fields.length))) + consumeGroup(writeFields(row.getStruct(ordinal, fields.length), structType)) case arrayType: ArrayType if followParquetFormatSpec => makeStandardArrayConsumer(arrayType.elementType) @@ -214,7 +208,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup { consumeField(repeatedGroupName, 0) { - val array = row.get(ordinal).asInstanceOf[Array[_]] + val array = row.get(ordinal).asInstanceOf[Seq[_]] var i = 0 while (i < array.length) { @@ -241,7 +235,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup { consumeField(repeatedFieldName, 0) { - val array = row.get(ordinal).asInstanceOf[Array[_]] + val array = row.get(ordinal).asInstanceOf[Seq[_]] var i = 0 while (i < array.length) { @@ -281,6 +275,12 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } + private def consumeMessage(f: => Unit): Unit = { + recordConsumer.startMessage() + f + recordConsumer.endMessage() + } + private def consumeGroup(f: => Unit): Unit = { recordConsumer.startGroup() f diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala index 1551afd7b7bf2..46cfa9dc0a0bc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala @@ -39,7 +39,7 @@ import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetO * * NEVER use [[DirectParquetOutputCommitter]] when appending data, because currently there's * no safe way undo a failed appending job (that's why both `abortTask()` and `abortJob()` are - * left * empty). + * left empty). 
*/ private[parquet] class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext) extends ParquetOutputCommitter(outputPath, context) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index 5a54cf75e6a3f..2ea64c715b9a3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -29,7 +29,7 @@ import org.apache.parquet.example.data.{Group, GroupWriter} import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.WriteContext import org.apache.parquet.hadoop.metadata.{CompressionCodecName, FileMetaData, ParquetMetadata} -import org.apache.parquet.hadoop.{Footer, ParquetFileWriter, ParquetOutputCommitter, ParquetWriter} +import org.apache.parquet.hadoop._ import org.apache.parquet.io.api.RecordConsumer import org.apache.parquet.schema.{MessageType, MessageTypeParser} @@ -203,14 +203,14 @@ class ParquetIOSuite extends QueryTest with ParquetTest { } test("compression codec") { - def compressionCodecFor(path: String): String = { - val codecs = readMetadata(new Path(path), configuration) - .getBlocks - .flatMap(_.getColumns) - .map(_.getCodec.name()) - .distinct - - assert(codecs.size === 1) + def compressionCodecFor(path: String, codecName: String): String = { + val codecs = for { + footer <- readAllFootersWithoutSummaryFiles(new Path(path), configuration) + block <- footer.getParquetMetadata.getBlocks + column <- block.getColumns + } yield column.getCodec.name() + + assert(codecs.distinct === Seq(codecName)) codecs.head } @@ -220,7 +220,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest { withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> codec.name()) { withParquetFile(data) { path => assertResult(sqlContext.conf.parquetCompressionCodec.toUpperCase) { - compressionCodecFor(path) + compressionCodecFor(path, codec.name()) } } } @@ -282,9 +282,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest { assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))) assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE))) - val metaData = readMetadata(path, configuration) - val actualSchema = metaData.getFileMetaData.getSchema val expectedSchema = new CatalystSchemaConverter(configuration).convert(schema) + val actualSchema = readFooter(path, configuration).getFileMetaData.getSchema actualSchema.checkContains(expectedSchema) expectedSchema.checkContains(actualSchema) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala index c1c7ca9ae5821..f5dc9051a2cc2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.parquet import java.io.File -import scala.collection.JavaConverters.{mapAsJavaMapConverter, seqAsJavaListConverter} +import scala.collection.JavaConverters.{iterableAsScalaIterableConverter, mapAsJavaMapConverter, seqAsJavaListConverter} import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag @@ -30,7 +30,6 @@ import org.apache.parquet.hadoop.metadata.{BlockMetaData, FileMetaData, ParquetM import org.apache.parquet.hadoop.{Footer, ParquetFileReader, ParquetFileWriter} import 
org.apache.spark.SparkFunSuite -import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SaveMode} @@ -117,19 +116,16 @@ private[sql] trait ParquetTest extends SQLTestUtils { this: SparkFunSuite => ParquetFileWriter.writeMetadataFile(configuration, path, Seq(footer).asJava) } - - def readMetadata(path: Path, configuration: Configuration): ParquetMetadata = { - val summaryFileNames = Seq( - ParquetFileWriter.PARQUET_METADATA_FILE, - ParquetFileWriter.PARQUET_COMMON_METADATA_FILE) - + def readAllFootersWithoutSummaryFiles( + path: Path, configuration: Configuration): Seq[Footer] = { val fs = path.getFileSystem(configuration) - val leaves = SparkHadoopUtil.get.listLeafStatuses(fs, path).filter { f => - val name = f.getPath.getName - name.startsWith(".") && name.startsWith("_") || summaryFileNames.contains(name) - } + ParquetFileReader.readAllFootersInParallel(configuration, fs.getFileStatus(path)).asScala.toSeq + } + def readFooter(path: Path, configuration: Configuration): ParquetMetadata = { ParquetFileReader.readFooter( - configuration, leaves.head, ParquetMetadataConverter.SKIP_ROW_GROUPS) + configuration, + new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE), + ParquetMetadataConverter.NO_FILTER) } } From 2a1e884d003d1fce3b439f219500dc375cb2d1b7 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 27 Jul 2015 20:43:28 +0800 Subject: [PATCH 04/21] Fixes writing UDT --- .../sql/parquet/CatalystWriteSupport.scala | 59 ++++++++++--------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index ca613eb47d871..555b29e3d9289 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { - type ValueConsumer = (InternalRow, Int) => Unit + type ValueWriter = (InternalRow, Int) => Unit private var schema: StructType = _ @@ -82,13 +82,13 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } private def writeFields(row: InternalRow, schema: StructType): Unit = { - val consumers = schema.map(_.dataType).map(makeConsumer) + val writers = schema.map(_.dataType).map(makeWriter) var i = 0 while (i < row.numFields) { if (!row.isNullAt(i)) { consumeField(schema(i).name, i) { - consumers(i).apply(row, i) + writers(i).apply(row, i) } } @@ -96,7 +96,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } - private def makeConsumer(dataType: DataType): ValueConsumer = { + private def makeWriter(dataType: DataType): ValueWriter = { dataType match { case BooleanType => (row: InternalRow, ordinal: Int) => @@ -168,41 +168,44 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi consumeGroup(writeFields(row.getStruct(ordinal, fields.length), structType)) case arrayType: ArrayType if followParquetFormatSpec => - makeStandardArrayConsumer(arrayType.elementType) + makeStandardArrayWriter(arrayType.elementType) case arrayType: ArrayType if !followParquetFormatSpec => - makeLegacyArrayConsumer(arrayType.elementType, 
arrayType.containsNull) + makeLegacyArrayWriter(arrayType.elementType, arrayType.containsNull) case mapType: MapType if followParquetFormatSpec => - makeMapConsumer(mapType.keyType, mapType.valueType, "key_value") + makeMapWriter(mapType.keyType, mapType.valueType, "key_value") case mapType: MapType if !followParquetFormatSpec => - makeMapConsumer(mapType.keyType, mapType.valueType, "map") + makeMapWriter(mapType.keyType, mapType.valueType, "map") + + case udt: UserDefinedType[_] => + makeWriter(udt.sqlType) case _ => sys.error(s"Unsupported data type $dataType.") } } - private def makeStandardArrayConsumer(elementType: DataType): ValueConsumer = { - makeThreeLevelArrayConsumer(elementType, "list", "element") + private def makeStandardArrayWriter(elementType: DataType): ValueWriter = { + makeThreeLevelArrayWriter(elementType, "list", "element") } - private def makeLegacyArrayConsumer( + private def makeLegacyArrayWriter( elementType: DataType, - containsNull: Boolean): ValueConsumer = { + containsNull: Boolean): ValueWriter = { if (containsNull) { - makeThreeLevelArrayConsumer(elementType, "bag", "array") + makeThreeLevelArrayWriter(elementType, "bag", "array") } else { - makeTwoLevelArrayConsumer(elementType, "array") + makeTwoLevelArrayWriter(elementType, "array") } } - private def makeThreeLevelArrayConsumer( + private def makeThreeLevelArrayWriter( elementType: DataType, repeatedGroupName: String, - elementFieldName: String): ValueConsumer = { - val elementConsumer = makeConsumer(elementType) + elementFieldName: String): ValueWriter = { + val elementWriter = makeWriter(elementType) val mutableRow = new SpecificMutableRow(elementType :: Nil) (row: InternalRow, ordinal: Int) => { @@ -215,7 +218,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi consumeGroup { if (array(i) != null) { mutableRow.update(0, array(i)) - consumeField(elementFieldName, 0)(elementConsumer.apply(mutableRow, 0)) + consumeField(elementFieldName, 0)(elementWriter.apply(mutableRow, 0)) } } @@ -226,10 +229,10 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } - private def makeTwoLevelArrayConsumer( + private def makeTwoLevelArrayWriter( elementType: DataType, - repeatedFieldName: String): ValueConsumer = { - val elementConsumer = makeConsumer(elementType) + repeatedFieldName: String): ValueWriter = { + val elementWriter = makeWriter(elementType) val mutableRow = new SpecificMutableRow(elementType :: Nil) (row: InternalRow, ordinal: Int) => { @@ -240,7 +243,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi while (i < array.length) { mutableRow.update(0, array(i)) - elementConsumer.apply(mutableRow, 0) + elementWriter.apply(mutableRow, 0) i += 1 } } @@ -248,12 +251,12 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } - private def makeMapConsumer( + private def makeMapWriter( keyType: DataType, valueType: DataType, - repeatedGroupName: String): ValueConsumer = { - val keyConsumer = makeConsumer(keyType) - val valueConsumer = makeConsumer(valueType) + repeatedGroupName: String): ValueWriter = { + val keyWriter = makeWriter(keyType) + val valueWriter = makeWriter(valueType) val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) (row: InternalRow, ordinal: Int) => { @@ -263,10 +266,10 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi for ((key, value) <- map) { consumeGroup { mutableRow.update(0, key) - consumeField("key", 
0)(keyConsumer.apply(mutableRow, 0)) + consumeField("key", 0)(keyWriter.apply(mutableRow, 0)) if (value != null) { mutableRow.update(1, value) - consumeField("value", 1)(valueConsumer.apply(mutableRow, 1)) + consumeField("value", 1)(valueWriter.apply(mutableRow, 1)) } } } From e9638f01f7c09d5664ac6b36c26d1831c870f710 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 27 Jul 2015 21:17:07 +0800 Subject: [PATCH 05/21] Optimizes writing structs --- .../sql/parquet/CatalystWriteSupport.scala | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 555b29e3d9289..4218e6f308456 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -40,6 +40,8 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi private var schema: StructType = _ + private var rootFieldWriters: Seq[ValueWriter] = _ + private var recordConsumer: RecordConsumer = _ private var followParquetFormatSpec: Boolean = _ @@ -53,6 +55,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi override def init(configuration: Configuration): WriteContext = { val schemaString = configuration.get(CatalystWriteSupport.SPARK_ROW_SCHEMA) schema = StructType.fromString(schemaString) + rootFieldWriters = schema.map(_.dataType).map(makeWriter) assert(configuration.get(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key) != null) followParquetFormatSpec = @@ -78,17 +81,17 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } override def write(row: InternalRow): Unit = { - consumeMessage(writeFields(row, schema)) + consumeMessage(writeFields(row, schema, rootFieldWriters)) } - private def writeFields(row: InternalRow, schema: StructType): Unit = { - val writers = schema.map(_.dataType).map(makeWriter) + private def writeFields( + row: InternalRow, schema: StructType, fieldWriters: Seq[ValueWriter]): Unit = { var i = 0 while (i < row.numFields) { if (!row.isNullAt(i)) { consumeField(schema(i).name, i) { - writers(i).apply(row, i) + fieldWriters(i).apply(row, i) } } @@ -163,9 +166,13 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) } - case structType @ StructType(fields) => + case structType: StructType => + val fieldWriters = structType.map(_.dataType).map(makeWriter) (row: InternalRow, ordinal: Int) => - consumeGroup(writeFields(row.getStruct(ordinal, fields.length), structType)) + consumeGroup { + val struct = row.getStruct(ordinal, structType.length) + writeFields(struct, structType, fieldWriters) + } case arrayType: ArrayType if followParquetFormatSpec => makeStandardArrayWriter(arrayType.elementType) From a2aeba5dd834b7232d91c227f67d49375b185b56 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 27 Jul 2015 22:01:30 +0800 Subject: [PATCH 06/21] Fixes writing empty arrays and maps --- .../sql/parquet/CatalystWriteSupport.scala | 59 ++++++++++--------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 4218e6f308456..54096b3515100 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -217,19 +217,19 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup { - consumeField(repeatedGroupName, 0) { - val array = row.get(ordinal).asInstanceOf[Seq[_]] - var i = 0 - - while (i < array.length) { - consumeGroup { - if (array(i) != null) { - mutableRow.update(0, array(i)) - consumeField(elementFieldName, 0)(elementWriter.apply(mutableRow, 0)) + val array = row.get(ordinal).asInstanceOf[Seq[_]] + if (array.nonEmpty) { + consumeField(repeatedGroupName, 0) { + var i = 0 + while (i < array.length) { + consumeGroup { + if (array(i) != null) { + mutableRow.update(0, array(i)) + consumeField(elementFieldName, 0)(elementWriter.apply(mutableRow, 0)) + } } + i += 1 } - - i += 1 } } } @@ -244,14 +244,15 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup { - consumeField(repeatedFieldName, 0) { - val array = row.get(ordinal).asInstanceOf[Seq[_]] - var i = 0 - - while (i < array.length) { - mutableRow.update(0, array(i)) - elementWriter.apply(mutableRow, 0) - i += 1 + val array = row.get(ordinal).asInstanceOf[Seq[_]] + if (array.nonEmpty) { + consumeField(repeatedFieldName, 0) { + var i = 0 + while (i < array.length) { + mutableRow.update(0, array(i)) + elementWriter.apply(mutableRow, 0) + i += 1 + } } } } @@ -268,15 +269,17 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup { - consumeField(repeatedGroupName, 0) { - val map = row.get(ordinal).asInstanceOf[Map[_, _]] - for ((key, value) <- map) { - consumeGroup { - mutableRow.update(0, key) - consumeField("key", 0)(keyWriter.apply(mutableRow, 0)) - if (value != null) { - mutableRow.update(1, value) - consumeField("value", 1)(valueWriter.apply(mutableRow, 1)) + val map = row.get(ordinal).asInstanceOf[Map[_, _]] + if (map.nonEmpty) { + consumeField(repeatedGroupName, 0) { + for ((key, value) <- map) { + consumeGroup { + mutableRow.update(0, key) + consumeField("key", 0)(keyWriter.apply(mutableRow, 0)) + if (value != null) { + mutableRow.update(1, value) + consumeField("value", 1)(valueWriter.apply(mutableRow, 1)) + } } } } From 678ccd4ec69ad5a082bec566a7270242d69f9fa5 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 28 Jul 2015 21:28:28 +0800 Subject: [PATCH 07/21] Migrates large decimal precision support --- .../org/apache/spark/sql/types/Decimal.scala | 4 +- .../sql/parquet/CatalystRowConverter.scala | 12 ++- .../sql/parquet/CatalystSchemaConverter.scala | 43 ++++----- .../sql/parquet/CatalystWriteSupport.scala | 92 ++++++++++++++----- 4 files changed, 101 insertions(+), 50 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index c0155eeb450a6..6754c4713830c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -106,7 +106,9 @@ final class Decimal extends Ordered[Decimal] with Serializable { */ def set(decimal: BigDecimal, precision: Int, scale: Int): Decimal = { this.decimalVal = decimal.setScale(scale, ROUNDING_MODE) - require(decimalVal.precision <= precision, "Overflowed precision") + require( + 
decimalVal.precision <= precision, + s"Precision overflow. Max precision: $precision, got: ${decimalVal.precision}") this.longVal = 0L this._precision = precision this._scale = scale diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala index 6938b071065cd..f294e12a8f005 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala @@ -264,7 +264,7 @@ private[parquet] class CatalystRowConverter( val scale = decimalType.scale val bytes = value.getBytes - if (precision <= 8) { + def bytesToUnscaledLong(bytes: Array[Byte]): Long = { // Constructs a `Decimal` with an unscaled `Long` value if possible. var unscaled = 0L var i = 0 @@ -275,11 +275,17 @@ private[parquet] class CatalystRowConverter( } val bits = 8 * bytes.length - unscaled = (unscaled << (64 - bits)) >> (64 - bits) + (unscaled << (64 - bits)) >> (64 - bits) + } + + if (precision <= CatalystSchemaConverter.MAX_PRECISION_FOR_INT64) { + // Constructs a `Decimal` with an unscaled `Long` value if possible. + val unscaled = bytesToUnscaledLong(bytes) Decimal(unscaled, precision, scale) } else { // Otherwise, resorts to an unscaled `BigInteger` instead. - Decimal(new BigDecimal(new BigInteger(bytes), scale), precision, scale) + val unscaled = new BigInteger(bytes) + Decimal(new BigDecimal(unscaled, scale), precision, scale) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala index 41b3c9d73e0af..ceb7a5479c9bb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala @@ -25,6 +25,7 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.parquet.schema.Type.Repetition._ import org.apache.parquet.schema._ +import org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, maxPrecisionForBytes, minBytesForPrecision} import org.apache.spark.sql.types._ import org.apache.spark.sql.{AnalysisException, SQLConf} @@ -155,7 +156,7 @@ private[parquet] class CatalystSchemaConverter( case INT_16 => ShortType case INT_32 | null => IntegerType case DATE => DateType - case DECIMAL => makeDecimalType(maxPrecisionForBytes(4)) + case DECIMAL => makeDecimalType(MAX_PRECISION_FOR_INT32) case TIME_MILLIS => typeNotImplemented() case _ => illegalType() } @@ -163,7 +164,7 @@ private[parquet] class CatalystSchemaConverter( case INT64 => originalType match { case INT_64 | null => LongType - case DECIMAL => makeDecimalType(maxPrecisionForBytes(8)) + case DECIMAL => makeDecimalType(MAX_PRECISION_FOR_INT64) case TIMESTAMP_MILLIS => typeNotImplemented() case _ => illegalType() } @@ -396,7 +397,7 @@ private[parquet] class CatalystSchemaConverter( .as(DECIMAL) .precision(precision) .scale(scale) - .length(CatalystSchemaConverter.minBytesForPrecision(precision)) + .length(minBytesForPrecision(precision)) .named(field.name) // ===================================== @@ -405,7 +406,7 @@ private[parquet] class CatalystSchemaConverter( // Uses INT32 for 1 <= precision <= 9 case DecimalType.Fixed(precision, scale) - if precision <= maxPrecisionForBytes(4) && followParquetFormatSpec => + if precision <= 
MAX_PRECISION_FOR_INT32 && followParquetFormatSpec => Types .primitive(INT32, repetition) .as(DECIMAL) @@ -413,9 +414,9 @@ private[parquet] class CatalystSchemaConverter( .scale(scale) .named(field.name) - // Uses INT64 for 1 <= precision <= 18 + // Uses INT64 for 10 <= precision <= 18 case DecimalType.Fixed(precision, scale) - if precision <= maxPrecisionForBytes(8) && followParquetFormatSpec => + if precision <= MAX_PRECISION_FOR_INT64 && followParquetFormatSpec => Types .primitive(INT64, repetition) .as(DECIMAL) @@ -430,7 +431,7 @@ private[parquet] class CatalystSchemaConverter( .as(DECIMAL) .precision(precision) .scale(scale) - .length(CatalystSchemaConverter.minBytesForPrecision(precision)) + .length(minBytesForPrecision(precision)) .named(field.name) // =================================================== @@ -534,14 +535,6 @@ private[parquet] class CatalystSchemaConverter( throw new AnalysisException(s"Unsupported data type $field.dataType") } } - - // Max precision of a decimal value stored in `numBytes` bytes - private def maxPrecisionForBytes(numBytes: Int): Int = { - Math.round( // convert double to long - Math.floor(Math.log10( // number of base-10 digits - Math.pow(2, 8 * numBytes - 1) - 1))) // max value stored in numBytes - .asInstanceOf[Int] - } } @@ -566,7 +559,8 @@ private[parquet] object CatalystSchemaConverter { } } - private def computeMinBytesForPrecision(precision : Int) : Int = { + // The minimum number of bytes needed to store a decimal with a given `precision`. + val minBytesForPrecision = Array.tabulate[Int](DecimalType.MAX_PRECISION + 1) { precision => var numBytes = 1 while (math.pow(2.0, 8 * numBytes - 1) < math.pow(10.0, precision)) { numBytes += 1 @@ -574,14 +568,15 @@ private[parquet] object CatalystSchemaConverter { numBytes } - private val MIN_BYTES_FOR_PRECISION = Array.tabulate[Int](39)(computeMinBytesForPrecision) + val MAX_PRECISION_FOR_INT32 = maxPrecisionForBytes(4) - // Returns the minimum number of bytes needed to store a decimal with a given `precision`. 
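// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the arithmetic behind
// maxPrecisionForBytes and minBytesForPrecision that this hunk moves into the
// companion object. Standalone Scala with invented names; the simplified
// helpers below are assumptions for the example, not the committed code.
object DecimalWidthExample extends App {
  // Max number of base-10 digits that fit in a signed value of `numBytes` bytes
  def maxPrecisionForBytes(numBytes: Int): Int =
    Math.floor(Math.log10(Math.pow(2, 8 * numBytes - 1) - 1)).toInt

  // Smallest byte width whose signed range covers 10^precision unscaled values
  def minBytesForPrecision(precision: Int): Int = {
    var numBytes = 1
    while (math.pow(2.0, 8 * numBytes - 1) < math.pow(10.0, precision)) numBytes += 1
    numBytes
  }

  assert(maxPrecisionForBytes(4) == 9)   // why MAX_PRECISION_FOR_INT32 is 9
  assert(maxPrecisionForBytes(8) == 18)  // why MAX_PRECISION_FOR_INT64 is 18
  assert(minBytesForPrecision(9) == 4)
  assert(minBytesForPrecision(18) == 8)
  assert(minBytesForPrecision(38) == 16) // precision 38 needs a 16-byte FIXED_LEN_BYTE_ARRAY
}
// ---------------------------------------------------------------------------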
- def minBytesForPrecision(precision : Int) : Int = { - if (precision < MIN_BYTES_FOR_PRECISION.length) { - MIN_BYTES_FOR_PRECISION(precision) - } else { - computeMinBytesForPrecision(precision) - } + val MAX_PRECISION_FOR_INT64 = maxPrecisionForBytes(8) + + // Max precision of a decimal value stored in `numBytes` bytes + def maxPrecisionForBytes(numBytes: Int): Int = { + Math.round( // convert double to long + Math.floor(Math.log10( // number of base-10 digits + Math.pow(2, 8 * numBytes - 1) - 1))) // max value stored in numBytes + .asInstanceOf[Int] } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 54096b3515100..54b610dca770b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.parquet import java.nio.{ByteBuffer, ByteOrder} +import java.util import scala.collection.JavaConverters.mapAsJavaMapConverter @@ -33,24 +34,30 @@ import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, minBytesForPrecision} import org.apache.spark.sql.types._ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { + // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer type ValueWriter = (InternalRow, Int) => Unit + // Schema of the `InternalRow`s to be written private var schema: StructType = _ + // `ValueWriter`s for all fields of the schema private var rootFieldWriters: Seq[ValueWriter] = _ + // The Parquet `RecordConsumer` to which all `InternalRow`s are written private var recordConsumer: RecordConsumer = _ + // Whether we should write standard Parquet data conforming to parquet-format spec or not private var followParquetFormatSpec: Boolean = _ - // Byte array used to write timestamps as Parquet INT96 values + // Reusable byte array used to write timestamps as Parquet INT96 values private val timestampBuffer = new Array[Byte](12) - // Byte array used to write decimal values - private val decimalBuffer = new Array[Byte](8) + // Reusable byte array used to write decimal values + private val decimalBuffer = new Array[Byte](minBytesForPrecision(DecimalType.MAX_PRECISION)) override def init(configuration: Configuration): WriteContext = { val schemaString = configuration.get(CatalystWriteSupport.SPARK_ROW_SCHEMA) @@ -145,26 +152,8 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal))) - case DecimalType.Fixed(precision, _) if precision > 18 => - sys.error(s"Unsupported data type $dataType. 
Decimal precision cannot be greater than 18.") - case DecimalType.Fixed(precision, _) => - (row: InternalRow, ordinal: Int) => { - val decimal = row.getDecimal(ordinal) - val numBytes = CatalystWriteSupport.BYTES_FOR_PRECISION(precision) - val unscaledLong = decimal.toUnscaledLong - - var i = 0 - var shift = 8 * (numBytes - 1) - - while (i < numBytes) { - decimalBuffer(i) = (unscaledLong >> shift).toByte - i += 1 - shift -= 8 - } - - recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) - } + makeDecimalWriter(precision) case structType: StructType => val fieldWriters = structType.map(_.dataType).map(makeWriter) @@ -194,6 +183,65 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } + private def makeDecimalWriter(precision: Int): ValueWriter = { + assert( + precision <= DecimalType.MAX_PRECISION, + s"Precision overflow: $precision is greater than ${DecimalType.MAX_PRECISION}") + + val numBytes = minBytesForPrecision(precision) + + val int32Writer = + (row: InternalRow, ordinal: Int) => + recordConsumer.addInteger(row.getDecimal(ordinal).toUnscaledLong.toInt) + + val int64Writer = + (row: InternalRow, ordinal: Int) => + recordConsumer.addLong(row.getDecimal(ordinal).toUnscaledLong) + + val binaryWriterUsingUnscaledLong = + (row: InternalRow, ordinal: Int) => { + // This writer converts underlying unscaled Long value to raw bytes using a reusable byte + // array to minimize array allocation. + + val unscaled = row.getDecimal(ordinal).toUnscaledLong + var i = 0 + var shift = 8 * (numBytes - 1) + + while (i < numBytes) { + decimalBuffer(i) = (unscaled >> shift).toByte + i += 1 + shift -= 8 + } + + recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) + } + + val binaryWriterUsingUnscaledBytes = + (row: InternalRow, ordinal: Int) => { + val decimal = row.getDecimal(ordinal) + val bytes = decimal.toJavaBigDecimal.unscaledValue().toByteArray + util.Arrays.fill(decimalBuffer, 0: Byte) + System.arraycopy(bytes, 0, decimalBuffer, numBytes - bytes.length, bytes.length) + recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) + } + + followParquetFormatSpec match { + // Standard mode, writes decimals with precision <= 9 as INT32 + case true if precision <= MAX_PRECISION_FOR_INT32 => int32Writer + + // Standard mode, writes decimals with precision <= 18 as INT64 + case true if precision <= MAX_PRECISION_FOR_INT64 => int64Writer + + // Legacy mode, writes decimals with precision <= 18 as BINARY + case false if precision <= MAX_PRECISION_FOR_INT64 => binaryWriterUsingUnscaledLong + + // All other cases: + // - Standard mode, writes decimals with precision > 18 as BINARY + // - Legacy mode, writes decimals with all precision as BINARY + case _ => binaryWriterUsingUnscaledBytes + } + } + private def makeStandardArrayWriter(elementType: DataType): ValueWriter = { makeThreeLevelArrayWriter(elementType, "list", "element") } From b9f93dbdaf7c710e4ef48c828f5b305a66e549d9 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 28 Jul 2015 23:38:02 +0800 Subject: [PATCH 08/21] Fixes writing negative decimal values --- .../spark/sql/parquet/CatalystWriteSupport.scala | 13 ++++++++++--- .../apache/spark/sql/parquet/ParquetIOSuite.scala | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 54b610dca770b..7df1192be9c57 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -220,9 +220,16 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { val decimal = row.getDecimal(ordinal) val bytes = decimal.toJavaBigDecimal.unscaledValue().toByteArray - util.Arrays.fill(decimalBuffer, 0: Byte) - System.arraycopy(bytes, 0, decimalBuffer, numBytes - bytes.length, bytes.length) - recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) + val binary = if (bytes.length == numBytes) { + bytes + } else { + val signByte = if (bytes.head < 0) -1: Byte else 0: Byte + util.Arrays.fill(decimalBuffer, 0, numBytes - bytes.length, signByte) + System.arraycopy(bytes, 0, decimalBuffer, numBytes - bytes.length, bytes.length) + decimalBuffer + } + + recordConsumer.addBinary(Binary.fromByteArray(binary, 0, numBytes)) } followParquetFormatSpec match { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index 2ea64c715b9a3..24ab233d22cb9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -101,7 +101,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest { def makeDecimalRDD(decimal: DecimalType): DataFrame = sqlContext.sparkContext .parallelize(0 to 1000) - .map(i => Tuple1(i / 100.0)) + .map(i => Tuple1((i - 500) / 100.0)) .toDF() // Parquet doesn't allow column names with spaces, have to add an alias here .select($"_1" cast decimal as "dec") From 0e0a957d25b94b8121867d24730490b7b4bde2be Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 29 Jul 2015 01:37:38 +0800 Subject: [PATCH 09/21] Minor comment updates --- .../apache/spark/sql/parquet/CatalystWriteSupport.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 7df1192be9c57..ec87d5bc88bdc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -141,7 +141,9 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi recordConsumer.addBinary(Binary.fromByteArray(row.getUTF8String(ordinal).getBytes)) case TimestampType => + // TODO Writes `TimestampType` values as `TIMESTAMP_MICROS` once parquet-mr implements it (row: InternalRow, ordinal: Int) => { + // Actually Spark SQL `TimestampType` only has microsecond precision. 
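// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the sign padding added in
// "Fixes writing negative decimal values" above. BigInteger.toByteArray
// returns the minimal two's-complement encoding, so a negative unscaled value
// must be left-padded with 0xFF rather than 0x00 to keep its value when
// widened to the fixed length. Standalone Scala; names are invented here.
object DecimalSignPaddingExample extends App {
  def toFixedLength(unscaled: java.math.BigInteger, numBytes: Int): Array[Byte] = {
    val bytes = unscaled.toByteArray
    if (bytes.length == numBytes) bytes
    else {
      val padded = new Array[Byte](numBytes)
      val signByte: Byte = if (bytes.head < 0) -1 else 0
      java.util.Arrays.fill(padded, 0, numBytes - bytes.length, signByte)
      System.arraycopy(bytes, 0, padded, numBytes - bytes.length, bytes.length)
      padded
    }
  }

  // A decimal of 3.00 at scale 2 has unscaled value 300; -3.00 has -300.
  val pos = toFixedLength(java.math.BigInteger.valueOf(300), 4)
  val neg = toFixedLength(java.math.BigInteger.valueOf(-300), 4)
  assert(pos.toSeq == Seq[Byte](0, 0, 1, 44))     // 0x0000012C ==  300
  assert(neg.toSeq == Seq[Byte](-1, -1, -2, -44)) // 0xFFFFFED4 == -300 (not 0x0000FED4)
}
// ---------------------------------------------------------------------------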
val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(row.getLong(ordinal)) val buf = ByteBuffer.wrap(timestampBuffer) buf.order(ByteOrder.LITTLE_ENDIAN).putLong(timeOfDayNanos).putInt(julianDay) @@ -239,12 +241,12 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi // Standard mode, writes decimals with precision <= 18 as INT64 case true if precision <= MAX_PRECISION_FOR_INT64 => int64Writer - // Legacy mode, writes decimals with precision <= 18 as BINARY + // Legacy mode, writes decimals with precision <= 18 as FIXED_LEN_BYTE_ARRAY case false if precision <= MAX_PRECISION_FOR_INT64 => binaryWriterUsingUnscaledLong // All other cases: - // - Standard mode, writes decimals with precision > 18 as BINARY - // - Legacy mode, writes decimals with all precision as BINARY + // - Standard mode, writes decimals with precision > 18 as FIXED_LEN_BYTE_ARRAY + // - Legacy mode, writes decimals with all precision as FIXED_LEN_BYTE_ARRAY case _ => binaryWriterUsingUnscaledBytes } } From 2859132e33a1d8d4e86016feca1d0876d162b380 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 29 Jul 2015 17:35:36 +0800 Subject: [PATCH 10/21] Fixes array type conversion in legacy mode --- .../sql/parquet/CatalystSchemaConverter.scala | 14 ++++++++------ .../spark/sql/parquet/ParquetSchemaSuite.scala | 6 +++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala index ceb7a5479c9bb..acfa1b1ab6443 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala @@ -439,12 +439,14 @@ private[parquet] class CatalystSchemaConverter( // =================================================== // Spark 1.4.x and prior versions convert ArrayType with nullable elements into a 3-level - // LIST structure. This behavior mimics parquet-hive (1.6.0rc3). Note that this case is - // covered by the backwards-compatibility rules implemented in `isElementType()`. + // LIST structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro + // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level anonymous + // field name "array" is from parquet-avro. Note that this case is covered by the backwards- + // compatibility rules implemented in `isElementType()`. case ArrayType(elementType, nullable @ true) if !followParquetFormatSpec => // group (LIST) { // optional group bag { - // repeated element; + // repeated array; // } // } ConversionPatterns.listType( @@ -452,8 +454,8 @@ private[parquet] class CatalystSchemaConverter( field.name, Types .buildGroup(REPEATED) - // "array_element" is the name chosen by parquet-hive (1.7.0 and prior version) - .addField(convertField(StructField("array_element", elementType, nullable))) + // "array" is the name chosen by Spark SQL 1.4.0 and prior versions + .addField(convertField(StructField("array", elementType, nullable))) .named("bag")) // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level @@ -461,7 +463,7 @@ private[parquet] class CatalystSchemaConverter( // covered by the backwards-compatibility rules implemented in `isElementType()`. 
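// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the two LIST layouts discussed
// in the hunk above, spelled out as Parquet message types. The legacy form
// matches the updated ParquetSchemaSuite expectations (element field renamed
// to "array"); the field name "f" and the int32 element are invented for the
// example.
object ListLayoutExample extends App {
  import org.apache.parquet.schema.MessageTypeParser

  // Legacy Spark 1.4.x layout for ArrayType(IntegerType, containsNull = true)
  val legacyNullableList = MessageTypeParser.parseMessageType(
    """message root {
      |  optional group f (LIST) {
      |    repeated group bag {
      |      optional int32 array;
      |    }
      |  }
      |}
    """.stripMargin)

  // Standard parquet-format layout emitted when followParquetFormatSpec is on
  val standardList = MessageTypeParser.parseMessageType(
    """message root {
      |  optional group f (LIST) {
      |    repeated group list {
      |      optional int32 element;
      |    }
      |  }
      |}
    """.stripMargin)

  println(legacyNullableList)
  println(standardList)
}
// ---------------------------------------------------------------------------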
case ArrayType(elementType, nullable @ false) if !followParquetFormatSpec => // group (LIST) { - // repeated element; + // repeated array; // } ConversionPatterns.listType( repetition, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index 7deba1cf6ebfc..ec6e37cd5ee85 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -198,7 +198,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional group _1 (LIST) { | repeated group bag { - | optional int32 array_element; + | optional int32 array; | } | } |} @@ -267,7 +267,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | optional binary _1 (UTF8); | optional group _2 (LIST) { | repeated group bag { - | optional group array_element { + | optional group array { | required int32 _1; | required double _2; | } @@ -616,7 +616,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional group f1 (LIST) { | repeated group bag { - | optional int32 array_element; + | optional int32 array; | } | } |} From 62c482930a61427e0eed7257080abcf23fee25b0 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 29 Jul 2015 18:12:35 +0800 Subject: [PATCH 11/21] Minor refactoring --- .../sql/parquet/CatalystWriteSupport.scala | 88 ++++++++----------- 1 file changed, 36 insertions(+), 52 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index ec87d5bc88bdc..3ffb7cb0dd180 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -157,28 +157,28 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi case DecimalType.Fixed(precision, _) => makeDecimalWriter(precision) - case structType: StructType => - val fieldWriters = structType.map(_.dataType).map(makeWriter) + case t: StructType => + val fieldWriters = t.map(_.dataType).map(makeWriter) (row: InternalRow, ordinal: Int) => - consumeGroup { - val struct = row.getStruct(ordinal, structType.length) - writeFields(struct, structType, fieldWriters) - } + consumeGroup(writeFields(row.getStruct(ordinal, t.length), t, fieldWriters)) + + case ArrayType(elementType, _) if followParquetFormatSpec => + makeThreeLevelArrayWriter(elementType, "list", "element") - case arrayType: ArrayType if followParquetFormatSpec => - makeStandardArrayWriter(arrayType.elementType) + case ArrayType(elementType, true) if !followParquetFormatSpec => + makeThreeLevelArrayWriter(elementType, "bag", "array") - case arrayType: ArrayType if !followParquetFormatSpec => - makeLegacyArrayWriter(arrayType.elementType, arrayType.containsNull) + case ArrayType(elementType, false) if !followParquetFormatSpec => + makeTwoLevelArrayWriter(elementType, "array") - case mapType: MapType if followParquetFormatSpec => - makeMapWriter(mapType.keyType, mapType.valueType, "key_value") + case t: MapType if followParquetFormatSpec => + makeMapWriter(t, "key_value") - case mapType: MapType if !followParquetFormatSpec => - makeMapWriter(mapType.keyType, mapType.valueType, "map") + case t: MapType if !followParquetFormatSpec => + makeMapWriter(t, "map") - case udt: 
UserDefinedType[_] => - makeWriter(udt.sqlType) + case t: UserDefinedType[_] => + makeWriter(t.sqlType) case _ => sys.error(s"Unsupported data type $dataType.") @@ -202,9 +202,9 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val binaryWriterUsingUnscaledLong = (row: InternalRow, ordinal: Int) => { - // This writer converts underlying unscaled Long value to raw bytes using a reusable byte - // array to minimize array allocation. - + // When the precision is low enough (<= 18) to squeeze the decimal value into a `Long`, we + // can build a fixed-length byte array with length `numBytes` using the unscaled `Long` + // value and the `decimalBuffer` for better performance. val unscaled = row.getDecimal(ordinal).toUnscaledLong var i = 0 var shift = 8 * (numBytes - 1) @@ -220,18 +220,22 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val binaryWriterUsingUnscaledBytes = (row: InternalRow, ordinal: Int) => { - val decimal = row.getDecimal(ordinal) - val bytes = decimal.toJavaBigDecimal.unscaledValue().toByteArray - val binary = if (bytes.length == numBytes) { + val bytes = row.getDecimal(ordinal).toJavaBigDecimal.unscaledValue().toByteArray + val fixedLengthBytes = if (bytes.length == numBytes) { + // If the length of the underlying byte array of the unscaled `BigInteger` happens to be + // `numBytes`, just reuse it, so that we don't bother copying it to `decimalBuffer`. bytes } else { + // Otherwise, the length must be less than `numBytes`. In this case we copy contents of + // the underlying bytes with enough sign bytes to `decimalBuffer` to form the result + // fixed-length byte array. val signByte = if (bytes.head < 0) -1: Byte else 0: Byte util.Arrays.fill(decimalBuffer, 0, numBytes - bytes.length, signByte) System.arraycopy(bytes, 0, decimalBuffer, numBytes - bytes.length, bytes.length) decimalBuffer } - recordConsumer.addBinary(Binary.fromByteArray(binary, 0, numBytes)) + recordConsumer.addBinary(Binary.fromByteArray(fixedLengthBytes, 0, numBytes)) } followParquetFormatSpec match { @@ -251,30 +255,14 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } - private def makeStandardArrayWriter(elementType: DataType): ValueWriter = { - makeThreeLevelArrayWriter(elementType, "list", "element") - } - - private def makeLegacyArrayWriter( - elementType: DataType, - containsNull: Boolean): ValueWriter = { - if (containsNull) { - makeThreeLevelArrayWriter(elementType, "bag", "array") - } else { - makeTwoLevelArrayWriter(elementType, "array") - } - } - private def makeThreeLevelArrayWriter( - elementType: DataType, - repeatedGroupName: String, - elementFieldName: String): ValueWriter = { + elementType: DataType, repeatedGroupName: String, elementFieldName: String): ValueWriter = { val elementWriter = makeWriter(elementType) val mutableRow = new SpecificMutableRow(elementType :: Nil) (row: InternalRow, ordinal: Int) => { consumeGroup { - val array = row.get(ordinal).asInstanceOf[Seq[_]] + val array = row.genericGet(ordinal).asInstanceOf[Seq[_]] if (array.nonEmpty) { consumeField(repeatedGroupName, 0) { var i = 0 @@ -294,14 +282,13 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } private def makeTwoLevelArrayWriter( - elementType: DataType, - repeatedFieldName: String): ValueWriter = { + elementType: DataType, repeatedFieldName: String): ValueWriter = { val elementWriter = makeWriter(elementType) val mutableRow = new SpecificMutableRow(elementType :: Nil) (row: 
InternalRow, ordinal: Int) => { consumeGroup { - val array = row.get(ordinal).asInstanceOf[Seq[_]] + val array = row.genericGet(ordinal).asInstanceOf[Seq[_]] if (array.nonEmpty) { consumeField(repeatedFieldName, 0) { var i = 0 @@ -316,17 +303,14 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } - private def makeMapWriter( - keyType: DataType, - valueType: DataType, - repeatedGroupName: String): ValueWriter = { - val keyWriter = makeWriter(keyType) - val valueWriter = makeWriter(valueType) - val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) + private def makeMapWriter(mapType: MapType, repeatedGroupName: String): ValueWriter = { + val keyWriter = makeWriter(mapType.keyType) + val valueWriter = makeWriter(mapType.valueType) + val mutableRow = new SpecificMutableRow(mapType.keyType :: mapType.valueType :: Nil) (row: InternalRow, ordinal: Int) => { consumeGroup { - val map = row.get(ordinal).asInstanceOf[Map[_, _]] + val map = row.get(ordinal, mapType).asInstanceOf[Map[_, _]] if (map.nonEmpty) { consumeField(repeatedGroupName, 0) { for ((key, value) <- map) { From b37fe7724fa6780ae4c5952d71a4e6b2e10fd959 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Fri, 31 Jul 2015 18:26:25 +0800 Subject: [PATCH 12/21] Fixes compilation error introduced while rebasing --- .../sql/parquet/CatalystWriteSupport.scala | 19 ++++++++++--------- .../spark/sql/parquet/ParquetRelation.scala | 3 +-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 3ffb7cb0dd180..db1d418fc1714 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -154,8 +154,8 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal))) - case DecimalType.Fixed(precision, _) => - makeDecimalWriter(precision) + case DecimalType.Fixed(precision, scale) => + makeDecimalWriter(precision, scale) case t: StructType => val fieldWriters = t.map(_.dataType).map(makeWriter) @@ -185,7 +185,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } - private def makeDecimalWriter(precision: Int): ValueWriter = { + private def makeDecimalWriter(precision: Int, scale: Int): ValueWriter = { assert( precision <= DecimalType.MAX_PRECISION, s"Precision overflow: $precision is greater than ${DecimalType.MAX_PRECISION}") @@ -194,18 +194,18 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val int32Writer = (row: InternalRow, ordinal: Int) => - recordConsumer.addInteger(row.getDecimal(ordinal).toUnscaledLong.toInt) + recordConsumer.addInteger(row.getDecimal(ordinal, precision, scale).toUnscaledLong.toInt) val int64Writer = (row: InternalRow, ordinal: Int) => - recordConsumer.addLong(row.getDecimal(ordinal).toUnscaledLong) + recordConsumer.addLong(row.getDecimal(ordinal, precision, scale).toUnscaledLong) val binaryWriterUsingUnscaledLong = (row: InternalRow, ordinal: Int) => { // When the precision is low enough (<= 18) to squeeze the decimal value into a `Long`, we // can build a fixed-length byte array with length `numBytes` using the unscaled `Long` // value and the `decimalBuffer` for better performance. 
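// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the shift loop described in the
// comment above, extracted as a standalone function. It emits the unscaled
// Long as big-endian bytes truncated to `numBytes`. The function and sample
// values are invented for the example (precision 5, scale 2, so 3 bytes).
object UnscaledLongToBytesExample extends App {
  def unscaledToBytes(unscaled: Long, numBytes: Int): Array[Byte] = {
    val out = new Array[Byte](numBytes)
    var i = 0
    var shift = 8 * (numBytes - 1)
    while (i < numBytes) {
      out(i) = (unscaled >> shift).toByte // most significant remaining byte first
      i += 1
      shift -= 8
    }
    out
  }

  // 123.45 at scale 2 has unscaled value 12345 = 0x3039
  assert(unscaledToBytes(12345L, 3).toSeq == Seq[Byte](0x00, 0x30, 0x39))
  // Negative values also round-trip because `>>` is an arithmetic (sign-preserving) shift
  assert(unscaledToBytes(-12345L, 3).toSeq == Seq[Byte](-1, 0xCF.toByte, 0xC7.toByte))
}
// ---------------------------------------------------------------------------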
- val unscaled = row.getDecimal(ordinal).toUnscaledLong + val unscaled = row.getDecimal(ordinal, precision, scale).toUnscaledLong var i = 0 var shift = 8 * (numBytes - 1) @@ -220,14 +220,15 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val binaryWriterUsingUnscaledBytes = (row: InternalRow, ordinal: Int) => { - val bytes = row.getDecimal(ordinal).toJavaBigDecimal.unscaledValue().toByteArray + val decimal = row.getDecimal(ordinal, precision, scale) + val bytes = decimal.toJavaBigDecimal.unscaledValue().toByteArray val fixedLengthBytes = if (bytes.length == numBytes) { // If the length of the underlying byte array of the unscaled `BigInteger` happens to be // `numBytes`, just reuse it, so that we don't bother copying it to `decimalBuffer`. bytes } else { // Otherwise, the length must be less than `numBytes`. In this case we copy contents of - // the underlying bytes with enough sign bytes to `decimalBuffer` to form the result + // the underlying bytes with padding sign bytes to `decimalBuffer` to form the result // fixed-length byte array. val signByte = if (bytes.head < 0) -1: Byte else 0: Byte util.Arrays.fill(decimalBuffer, 0, numBytes - bytes.length, signByte) @@ -250,7 +251,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi // All other cases: // - Standard mode, writes decimals with precision > 18 as FIXED_LEN_BYTE_ARRAY - // - Legacy mode, writes decimals with all precision as FIXED_LEN_BYTE_ARRAY + // - Legacy mode, writes decimals with precision > 18 as FIXED_LEN_BYTE_ARRAY case _ => binaryWriterUsingUnscaledBytes } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala index 2d98792385664..b7202375dc128 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala @@ -38,12 +38,11 @@ import org.apache.parquet.schema.MessageType import org.apache.parquet.{Log => ParquetLog} import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.{SqlNewHadoopPartition, SqlNewHadoopRDD, RDD} import org.apache.spark.rdd.RDD._ +import org.apache.spark.rdd.{RDD, SqlNewHadoopPartition, SqlNewHadoopRDD} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionSpec -import org.apache.spark.sql.execution.{SqlNewHadoopPartition, SqlNewHadoopRDD} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.{SerializableConfiguration, Utils} From 0d61b3bda95bd825f9a667dcfe8d0f5df8e6c594 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Fri, 31 Jul 2015 19:04:47 +0800 Subject: [PATCH 13/21] Writes arrays using ArrayData --- .../sql/parquet/CatalystWriteSupport.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index db1d418fc1714..0606d03a1b9f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -263,14 +263,14 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup 
{ - val array = row.genericGet(ordinal).asInstanceOf[Seq[_]] - if (array.nonEmpty) { + val array = row.getArray(ordinal) + if (array.numElements() > 0) { consumeField(repeatedGroupName, 0) { var i = 0 - while (i < array.length) { + while (i < array.numElements()) { consumeGroup { - if (array(i) != null) { - mutableRow.update(0, array(i)) + if (!array.isNullAt(i)) { + mutableRow.update(0, array.get(i)) consumeField(elementFieldName, 0)(elementWriter.apply(mutableRow, 0)) } } @@ -289,12 +289,12 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup { - val array = row.genericGet(ordinal).asInstanceOf[Seq[_]] - if (array.nonEmpty) { + val array = row.getArray(ordinal) + if (array.numElements() > 0) { consumeField(repeatedFieldName, 0) { var i = 0 - while (i < array.length) { - mutableRow.update(0, array(i)) + while (i < array.numElements()) { + mutableRow.update(0, array.get(i)) elementWriter.apply(mutableRow, 0) i += 1 } From f901a16d10ce7ac7c6be0e54a3c15669e9189ad4 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sat, 1 Aug 2015 22:54:12 +0800 Subject: [PATCH 14/21] Writes maps using MapData --- .../sql/parquet/CatalystWriteSupport.scala | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 0606d03a1b9f8..0b28543744d93 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -39,7 +39,7 @@ import org.apache.spark.sql.types._ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer - type ValueWriter = (InternalRow, Int) => Unit + private type ValueWriter = (InternalRow, Int) => Unit // Schema of the `InternalRow`s to be written private var schema: StructType = _ @@ -270,7 +270,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi while (i < array.numElements()) { consumeGroup { if (!array.isNullAt(i)) { - mutableRow.update(0, array.get(i)) + mutableRow.update(0, array.get(i, elementType)) consumeField(elementFieldName, 0)(elementWriter.apply(mutableRow, 0)) } } @@ -294,7 +294,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi consumeField(repeatedFieldName, 0) { var i = 0 while (i < array.numElements()) { - mutableRow.update(0, array.get(i)) + mutableRow.update(0, array.get(i, elementType)) elementWriter.apply(mutableRow, 0) i += 1 } @@ -305,24 +305,28 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } private def makeMapWriter(mapType: MapType, repeatedGroupName: String): ValueWriter = { - val keyWriter = makeWriter(mapType.keyType) - val valueWriter = makeWriter(mapType.valueType) - val mutableRow = new SpecificMutableRow(mapType.keyType :: mapType.valueType :: Nil) + val keyType = mapType.keyType + val valueType = mapType.valueType + val keyWriter = makeWriter(keyType) + val valueWriter = makeWriter(valueType) + val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) (row: InternalRow, ordinal: Int) => { consumeGroup { - val map = row.get(ordinal, mapType).asInstanceOf[Map[_, _]] - if (map.nonEmpty) { + val map = row.getMap(ordinal) + if 
(map.numElements() > 0) { consumeField(repeatedGroupName, 0) { - for ((key, value) <- map) { + var i = 0 + while (i < map.numElements()) { consumeGroup { - mutableRow.update(0, key) + mutableRow.update(0, map.keyArray().get(i, keyType)) consumeField("key", 0)(keyWriter.apply(mutableRow, 0)) - if (value != null) { - mutableRow.update(1, value) + if (!map.valueArray().isNullAt(i)) { + mutableRow.update(1, map.valueArray().get(i, valueType)) consumeField("value", 1)(valueWriter.apply(mutableRow, 1)) } } + i += 1 } } } From 5127b8d0fd8584e45b447a6e76ef58e572961a7e Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 2 Aug 2015 00:04:24 +0800 Subject: [PATCH 15/21] Retrieves data from SpecializedGetters --- .../sql/parquet/CatalystWriteSupport.scala | 54 +++++++++---------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 0b28543744d93..553d05e8c6b76 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -32,14 +32,16 @@ import org.apache.parquet.io.api.{Binary, RecordConsumer} import org.apache.spark.Logging import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow +import org.apache.spark.sql.catalyst.expressions.{SpecializedGetters, SpecificMutableRow} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, minBytesForPrecision} import org.apache.spark.sql.types._ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { - // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer - private type ValueWriter = (InternalRow, Int) => Unit + // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer. + // Here we are using `SpecializedGetters` rather than `InternalRow` so that we can directly access + // data in `ArrayData` without the help of `SpecificMutableRow`. 
+ private type ValueWriter = (SpecializedGetters, Int) => Unit // Schema of the `InternalRow`s to be written private var schema: StructType = _ @@ -94,14 +96,12 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi private def writeFields( row: InternalRow, schema: StructType, fieldWriters: Seq[ValueWriter]): Unit = { var i = 0 - while (i < row.numFields) { if (!row.isNullAt(i)) { consumeField(schema(i).name, i) { fieldWriters(i).apply(row, i) } } - i += 1 } } @@ -109,40 +109,40 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi private def makeWriter(dataType: DataType): ValueWriter = { dataType match { case BooleanType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addBoolean(row.getBoolean(ordinal)) case ByteType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addInteger(row.getByte(ordinal)) case ShortType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addInteger(row.getShort(ordinal)) case IntegerType | DateType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addInteger(row.getInt(ordinal)) case LongType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addLong(row.getLong(ordinal)) case FloatType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addFloat(row.getFloat(ordinal)) case DoubleType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addDouble(row.getDouble(ordinal)) case StringType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addBinary(Binary.fromByteArray(row.getUTF8String(ordinal).getBytes)) case TimestampType => // TODO Writes `TimestampType` values as `TIMESTAMP_MICROS` once parquet-mr implements it - (row: InternalRow, ordinal: Int) => { + (row: SpecializedGetters, ordinal: Int) => { // Actually Spark SQL `TimestampType` only has microsecond precision. 
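// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the 12-byte INT96 layout the
// timestamp writer produces, i.e. 8 bytes of little-endian nanoseconds-of-day
// followed by 4 bytes of little-endian Julian day number. Standalone Scala;
// the object and sample values below are invented for the example (2440588 is
// the Julian day number of the Unix epoch, 1970-01-01).
object Int96TimestampLayoutExample extends App {
  import java.nio.{ByteBuffer, ByteOrder}

  def toInt96(julianDay: Int, timeOfDayNanos: Long): Array[Byte] = {
    val buf = new Array[Byte](12)
    ByteBuffer.wrap(buf)
      .order(ByteOrder.LITTLE_ENDIAN)
      .putLong(timeOfDayNanos) // bytes 0-7: nanoseconds within the day
      .putInt(julianDay)       // bytes 8-11: Julian day number
    buf
  }

  // Round-trip an example (julianDay, nanosOfDay) pair through the raw bytes
  val bytes = toInt96(julianDay = 2440588, timeOfDayNanos = 3600L * 1000L * 1000L * 1000L)
  val rb = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
  assert(rb.getLong == 3600L * 1000L * 1000L * 1000L)
  assert(rb.getInt == 2440588)
}
// ---------------------------------------------------------------------------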
val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(row.getLong(ordinal)) val buf = ByteBuffer.wrap(timestampBuffer) @@ -151,7 +151,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } case BinaryType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal))) case DecimalType.Fixed(precision, scale) => @@ -159,7 +159,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi case t: StructType => val fieldWriters = t.map(_.dataType).map(makeWriter) - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => consumeGroup(writeFields(row.getStruct(ordinal, t.length), t, fieldWriters)) case ArrayType(elementType, _) if followParquetFormatSpec => @@ -193,15 +193,15 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val numBytes = minBytesForPrecision(precision) val int32Writer = - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addInteger(row.getDecimal(ordinal, precision, scale).toUnscaledLong.toInt) val int64Writer = - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addLong(row.getDecimal(ordinal, precision, scale).toUnscaledLong) val binaryWriterUsingUnscaledLong = - (row: InternalRow, ordinal: Int) => { + (row: SpecializedGetters, ordinal: Int) => { // When the precision is low enough (<= 18) to squeeze the decimal value into a `Long`, we // can build a fixed-length byte array with length `numBytes` using the unscaled `Long` // value and the `decimalBuffer` for better performance. @@ -219,7 +219,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } val binaryWriterUsingUnscaledBytes = - (row: InternalRow, ordinal: Int) => { + (row: SpecializedGetters, ordinal: Int) => { val decimal = row.getDecimal(ordinal, precision, scale) val bytes = decimal.toJavaBigDecimal.unscaledValue().toByteArray val fixedLengthBytes = if (bytes.length == numBytes) { @@ -259,9 +259,8 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi private def makeThreeLevelArrayWriter( elementType: DataType, repeatedGroupName: String, elementFieldName: String): ValueWriter = { val elementWriter = makeWriter(elementType) - val mutableRow = new SpecificMutableRow(elementType :: Nil) - (row: InternalRow, ordinal: Int) => { + (row: SpecializedGetters, ordinal: Int) => { consumeGroup { val array = row.getArray(ordinal) if (array.numElements() > 0) { @@ -270,8 +269,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi while (i < array.numElements()) { consumeGroup { if (!array.isNullAt(i)) { - mutableRow.update(0, array.get(i, elementType)) - consumeField(elementFieldName, 0)(elementWriter.apply(mutableRow, 0)) + consumeField(elementFieldName, 0)(elementWriter.apply(array, i)) } } i += 1 @@ -285,17 +283,15 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi private def makeTwoLevelArrayWriter( elementType: DataType, repeatedFieldName: String): ValueWriter = { val elementWriter = makeWriter(elementType) - val mutableRow = new SpecificMutableRow(elementType :: Nil) - (row: InternalRow, ordinal: Int) => { + (row: SpecializedGetters, ordinal: Int) => { consumeGroup { val array = row.getArray(ordinal) if (array.numElements() > 0) { consumeField(repeatedFieldName, 0) { var i = 0 while (i < 
array.numElements()) { - mutableRow.update(0, array.get(i, elementType)) - elementWriter.apply(mutableRow, 0) + elementWriter.apply(array, i) i += 1 } } @@ -311,7 +307,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val valueWriter = makeWriter(valueType) val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) - (row: InternalRow, ordinal: Int) => { + (row: SpecializedGetters, ordinal: Int) => { consumeGroup { val map = row.getMap(ordinal) if (map.numElements() > 0) { From 1f1d4af761760ac99de8fdccae30a13851304d26 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 2 Aug 2015 15:49:22 +0800 Subject: [PATCH 16/21] Renames "Catalyst*" classes to "Parquet*" --- ...Support.scala => ParquetReadSupport.scala} | 22 ++++++------- ....scala => ParquetRecordMaterializer.scala} | 4 +-- .../spark/sql/parquet/ParquetRelation.scala | 24 +++++++------- ...verter.scala => ParquetRowConverter.scala} | 32 +++++++++---------- ...ter.scala => ParquetSchemaConverter.scala} | 22 ++++++------- ...upport.scala => ParquetWriteSupport.scala} | 25 +++++++++------ .../spark/sql/parquet/ParquetIOSuite.scala | 4 +-- .../sql/parquet/ParquetSchemaSuite.scala | 4 +-- .../spark/sql/parquet/ParquetTest.scala | 4 +-- 9 files changed, 73 insertions(+), 68 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/parquet/{CatalystReadSupport.scala => ParquetReadSupport.scala} (88%) rename sql/core/src/main/scala/org/apache/spark/sql/parquet/{CatalystRecordMaterializer.scala => ParquetRecordMaterializer.scala} (90%) rename sql/core/src/main/scala/org/apache/spark/sql/parquet/{CatalystRowConverter.scala => ParquetRowConverter.scala} (93%) rename sql/core/src/main/scala/org/apache/spark/sql/parquet/{CatalystSchemaConverter.scala => ParquetSchemaConverter.scala} (97%) rename sql/core/src/main/scala/org/apache/spark/sql/parquet/{CatalystWriteSupport.scala => ParquetWriteSupport.scala} (92%) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala similarity index 88% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala rename to sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala index 9648035744c1d..b67d15264c415 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala @@ -31,7 +31,7 @@ import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType -private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with Logging { +private[parquet] class ParquetReadSupport extends ReadSupport[InternalRow] with Logging { override def prepareForRead( conf: Configuration, keyValueMetaData: JMap[String, String], @@ -39,24 +39,24 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with readContext: ReadContext): RecordMaterializer[InternalRow] = { log.debug(s"Preparing for read Parquet file with message type: $fileSchema") - val toCatalyst = new CatalystSchemaConverter(conf) + val toCatalyst = new ParquetSchemaConverter(conf) val parquetRequestedSchema = readContext.getRequestedSchema val catalystRequestedSchema = Option(readContext.getReadSupportMetadata).map(_.toMap).flatMap { metadata => metadata // First tries to read requested schema, which may result from projections - 
.get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA) + .get(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA) // If not available, tries to read Catalyst schema from file metadata. It's only // available if the target file is written by Spark SQL. - .orElse(metadata.get(CatalystReadSupport.SPARK_METADATA_KEY)) + .orElse(metadata.get(ParquetReadSupport.SPARK_METADATA_KEY)) }.map(StructType.fromString).getOrElse { logDebug("Catalyst schema not available, falling back to Parquet schema") toCatalyst.convert(parquetRequestedSchema) } logDebug(s"Catalyst schema used to read Parquet files: $catalystRequestedSchema") - new CatalystRecordMaterializer(parquetRequestedSchema, catalystRequestedSchema) + new ParquetRecordMaterializer(parquetRequestedSchema, catalystRequestedSchema) } override def init(context: InitContext): ReadContext = { @@ -64,11 +64,11 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with // If the target file was written by Spark SQL, we should be able to find a serialized Catalyst // schema of this file from its the metadata. - val maybeRowSchema = Option(conf.get(CatalystWriteSupport.SPARK_ROW_SCHEMA)) + val maybeRowSchema = Option(conf.get(ParquetWriteSupport.SPARK_ROW_SCHEMA)) // Optional schema of requested columns, in the form of a string serialized from a Catalyst // `StructType` containing all requested columns. - val maybeRequestedSchema = Option(conf.get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA)) + val maybeRequestedSchema = Option(conf.get(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA)) // Below we construct a Parquet schema containing all requested columns. This schema tells // Parquet which columns to read. @@ -110,7 +110,7 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with // different physical structures. 
val parquetRequestedSchema = maybeRequestedSchema.fold(context.getFileSchema) { schemaString => - val toParquet = new CatalystSchemaConverter(conf) + val toParquet = new ParquetSchemaConverter(conf) val fileSchema = context.getFileSchema.asGroupType() val fileFieldNames = fileSchema.getFields.map(_.getName).toSet @@ -138,15 +138,15 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with val metadata = Map.empty[String, String] ++ - maybeRequestedSchema.map(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++ - maybeRowSchema.map(CatalystWriteSupport.SPARK_ROW_SCHEMA -> _) + maybeRequestedSchema.map(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++ + maybeRowSchema.map(ParquetWriteSupport.SPARK_ROW_SCHEMA -> _) logInfo(s"Going to read Parquet file with these requested columns: $parquetRequestedSchema") new ReadContext(parquetRequestedSchema, metadata) } } -private[parquet] object CatalystReadSupport { +private[parquet] object ParquetReadSupport { val SPARK_ROW_REQUESTED_SCHEMA = "org.apache.spark.sql.parquet.row.requested_schema" val SPARK_METADATA_KEY = "org.apache.spark.sql.parquet.row.metadata" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRecordMaterializer.scala similarity index 90% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala rename to sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRecordMaterializer.scala index 84f1dccfeb788..8c1971dbe4d65 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRecordMaterializer.scala @@ -29,11 +29,11 @@ import org.apache.spark.sql.types.StructType * @param parquetSchema Parquet schema of the records to be read * @param catalystSchema Catalyst schema of the rows to be constructed */ -private[parquet] class CatalystRecordMaterializer( +private[parquet] class ParquetRecordMaterializer( parquetSchema: MessageType, catalystSchema: StructType) extends RecordMaterializer[InternalRow] { - private val rootConverter = new CatalystRowConverter(parquetSchema, catalystSchema, NoopUpdater) + private val rootConverter = new ParquetRowConverter(parquetSchema, catalystSchema, NoopUpdater) override def getCurrentRecord: InternalRow = rootConverter.currentRow diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala index b7202375dc128..f2e0f78da4433 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala @@ -228,8 +228,8 @@ private[sql] class ParquetRelation( // bundled with `ParquetOutputFormat[Row]`. 
job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]]) - ParquetOutputFormat.setWriteSupportClass(job, classOf[CatalystWriteSupport]) - CatalystWriteSupport.setSchema(dataSchema, conf) + ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport]) + ParquetWriteSupport.setSchema(dataSchema, conf) // Sets flag for Parquet schema converter (converting Catalyst schema to Parquet schema) conf.set( @@ -474,7 +474,7 @@ private[sql] object ParquetRelation extends Logging { assumeBinaryIsString: Boolean, assumeInt96IsTimestamp: Boolean)(job: Job): Unit = { val conf = job.getConfiguration - conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[CatalystReadSupport].getName) + conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[ParquetReadSupport].getName) // Try to push down filters when filter push-down is enabled. if (parquetFilterPushDown) { @@ -487,14 +487,14 @@ private[sql] object ParquetRelation extends Logging { .foreach(ParquetInputFormat.setFilterPredicate(conf, _)) } - conf.set(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA, { + conf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, { val requestedSchema = StructType(requiredColumns.map(dataSchema(_))) - CatalystSchemaConverter.checkFieldNames(requestedSchema).json + ParquetSchemaConverter.checkFieldNames(requestedSchema).json }) conf.set( - CatalystWriteSupport.SPARK_ROW_SCHEMA, - CatalystSchemaConverter.checkFieldNames(dataSchema).json) + ParquetWriteSupport.SPARK_ROW_SCHEMA, + ParquetSchemaConverter.checkFieldNames(dataSchema).json) // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata conf.setBoolean(SQLConf.PARQUET_CACHE_METADATA.key, useMetadataCache) @@ -518,7 +518,7 @@ private[sql] object ParquetRelation extends Logging { footers: Seq[Footer], sqlContext: SQLContext): Option[StructType] = { def parseParquetSchema(schema: MessageType): StructType = { - val converter = new CatalystSchemaConverter( + val converter = new ParquetSchemaConverter( sqlContext.conf.isParquetBinaryAsString, sqlContext.conf.isParquetBinaryAsString, sqlContext.conf.followParquetFormatSpec) @@ -532,7 +532,7 @@ private[sql] object ParquetRelation extends Logging { val serializedSchema = metadata .getKeyValueMetaData .toMap - .get(CatalystReadSupport.SPARK_METADATA_KEY) + .get(ParquetReadSupport.SPARK_METADATA_KEY) if (serializedSchema.isEmpty) { // Falls back to Parquet schema if no Spark SQL schema found. Some(parseParquetSchema(metadata.getSchema)) @@ -692,7 +692,7 @@ private[sql] object ParquetRelation extends Logging { // Converter used to convert Parquet `MessageType` to Spark SQL `StructType` val converter = - new CatalystSchemaConverter( + new ParquetSchemaConverter( assumeBinaryIsString = assumeBinaryIsString, assumeInt96IsTimestamp = assumeInt96IsTimestamp, followParquetFormatSpec = followParquetFormatSpec) @@ -711,12 +711,12 @@ private[sql] object ParquetRelation extends Logging { * a [[StructType]] converted from the [[MessageType]] stored in this footer. 
*/ def readSchemaFromFooter( - footer: Footer, converter: CatalystSchemaConverter): StructType = { + footer: Footer, converter: ParquetSchemaConverter): StructType = { val fileMetaData = footer.getParquetMetadata.getFileMetaData fileMetaData .getKeyValueMetaData .toMap - .get(CatalystReadSupport.SPARK_METADATA_KEY) + .get(ParquetReadSupport.SPARK_METADATA_KEY) .flatMap(deserializeSchemaString) .getOrElse(converter.convert(fileMetaData.getSchema)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRowConverter.scala similarity index 93% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala rename to sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRowConverter.scala index f294e12a8f005..48ab2f3848675 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRowConverter.scala @@ -56,7 +56,7 @@ private[parquet] trait ParentContainerUpdater { private[parquet] object NoopUpdater extends ParentContainerUpdater /** - * A [[CatalystRowConverter]] is used to convert Parquet "structs" into Spark SQL [[InternalRow]]s. + * A [[ParquetRowConverter]] is used to convert Parquet records into Spark SQL [[InternalRow]]s. * Since any Parquet record is also a struct, this converter can also be used as root converter. * * When used as a root converter, [[NoopUpdater]] should be used since root converters don't have @@ -66,14 +66,14 @@ private[parquet] object NoopUpdater extends ParentContainerUpdater * @param catalystType Spark SQL schema that corresponds to the Parquet record type * @param updater An updater which propagates converted field values to the parent container */ -private[parquet] class CatalystRowConverter( +private[parquet] class ParquetRowConverter( parquetType: GroupType, catalystType: StructType, updater: ParentContainerUpdater) extends GroupConverter { /** - * Updater used together with field converters within a [[CatalystRowConverter]]. It propagates + * Updater used together with field converters within a [[ParquetRowConverter]]. It propagates * converted filed values to the `ordinal`-th cell in `currentRow`. */ private final class RowUpdater(row: MutableRow, ordinal: Int) extends ParentContainerUpdater { @@ -126,7 +126,7 @@ private[parquet] class CatalystRowConverter( catalystType match { case BooleanType | IntegerType | LongType | FloatType | DoubleType | BinaryType => - new CatalystPrimitiveConverter(updater) + new ParquetPrimitiveConverter(updater) case ByteType => new PrimitiveConverter { @@ -141,10 +141,10 @@ private[parquet] class CatalystRowConverter( } case t: DecimalType => - new CatalystDecimalConverter(t, updater) + new ParquetDecimalConverter(t, updater) case StringType => - new CatalystStringConverter(updater) + new ParquetStringConverter(updater) case TimestampType => // TODO Implements `TIMESTAMP_MICROS` once parquet-mr has that. 
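
Note: every nested converter above hands its decoded value to the enclosing container through a `ParentContainerUpdater`, which is what lets the same primitive converter fill either a row cell or an array slot. A minimal sketch of that pattern, assuming only parquet-mr's `PrimitiveConverter.addInt` callback and Catalyst's `MutableRow.setInt` (the class name below is hypothetical, not the actual Spark implementation):

    import org.apache.parquet.io.api.PrimitiveConverter
    import org.apache.spark.sql.catalyst.expressions.MutableRow

    // Forwards each decoded INT32 value into the `ordinal`-th cell of the row being materialized.
    class IntCellConverter(row: MutableRow, ordinal: Int) extends PrimitiveConverter {
      override def addInt(value: Int): Unit = row.setInt(ordinal, value)
    }
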
@@ -172,13 +172,13 @@ private[parquet] class CatalystRowConverter( } case t: ArrayType => - new CatalystArrayConverter(parquetType.asGroupType(), t, updater) + new ParquetArrayConverter(parquetType.asGroupType(), t, updater) case t: MapType => - new CatalystMapConverter(parquetType.asGroupType(), t, updater) + new ParquetMapConverter(parquetType.asGroupType(), t, updater) case t: StructType => - new CatalystRowConverter(parquetType.asGroupType(), t, new ParentContainerUpdater { + new ParquetRowConverter(parquetType.asGroupType(), t, new ParentContainerUpdater { override def set(value: Any): Unit = updater.set(value.asInstanceOf[InternalRow].copy()) }) @@ -186,7 +186,7 @@ private[parquet] class CatalystRowConverter( val catalystTypeForUDT = t.sqlType val nullable = parquetType.isRepetition(Repetition.OPTIONAL) val field = StructField("udt", catalystTypeForUDT, nullable) - val parquetTypeForUDT = new CatalystSchemaConverter().convertField(field) + val parquetTypeForUDT = new ParquetSchemaConverter().convertField(field) newConverter(parquetTypeForUDT, catalystTypeForUDT, updater) case _ => @@ -200,7 +200,7 @@ private[parquet] class CatalystRowConverter( * are handled by this converter. Parquet primitive types are only a subset of those of Spark * SQL. For example, BYTE, SHORT, and INT in Spark SQL are all covered by INT32 in Parquet. */ - private final class CatalystPrimitiveConverter(updater: ParentContainerUpdater) + private final class ParquetPrimitiveConverter(updater: ParentContainerUpdater) extends PrimitiveConverter { override def addBoolean(value: Boolean): Unit = updater.setBoolean(value) @@ -214,7 +214,7 @@ private[parquet] class CatalystRowConverter( /** * Parquet converter for strings. A dictionary is used to minimize string decoding cost. */ - private final class CatalystStringConverter(updater: ParentContainerUpdater) + private final class ParquetStringConverter(updater: ParentContainerUpdater) extends PrimitiveConverter { private var expandedDictionary: Array[UTF8String] = null @@ -239,7 +239,7 @@ private[parquet] class CatalystRowConverter( /** * Parquet converter for fixed-precision decimals. */ - private final class CatalystDecimalConverter( + private final class ParquetDecimalConverter( decimalType: DecimalType, updater: ParentContainerUpdater) extends PrimitiveConverter { @@ -278,7 +278,7 @@ private[parquet] class CatalystRowConverter( (unscaled << (64 - bits)) >> (64 - bits) } - if (precision <= CatalystSchemaConverter.MAX_PRECISION_FOR_INT64) { + if (precision <= ParquetSchemaConverter.MAX_PRECISION_FOR_INT64) { // Constructs a `Decimal` with an unscaled `Long` value if possible. 
val unscaled = bytesToUnscaledLong(bytes) Decimal(unscaled, precision, scale) @@ -308,7 +308,7 @@ private[parquet] class CatalystRowConverter( * * @see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists */ - private final class CatalystArrayConverter( + private final class ParquetArrayConverter( parquetSchema: GroupType, catalystSchema: ArrayType, updater: ParentContainerUpdater) @@ -385,7 +385,7 @@ private[parquet] class CatalystRowConverter( } /** Parquet converter for maps */ - private final class CatalystMapConverter( + private final class ParquetMapConverter( parquetType: GroupType, catalystType: MapType, updater: ParentContainerUpdater) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala similarity index 97% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala rename to sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala index acfa1b1ab6443..5c5c1f00f75b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala @@ -25,7 +25,7 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.parquet.schema.Type.Repetition._ import org.apache.parquet.schema._ -import org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, maxPrecisionForBytes, minBytesForPrecision} +import org.apache.spark.sql.parquet.ParquetSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, maxPrecisionForBytes, minBytesForPrecision} import org.apache.spark.sql.types._ import org.apache.spark.sql.{AnalysisException, SQLConf} @@ -54,7 +54,7 @@ import org.apache.spark.sql.{AnalysisException, SQLConf} * backwards-compatible with these settings. If this argument is set to `false`, we fallback * to old style non-standard behaviors. */ -private[parquet] class CatalystSchemaConverter( +private[parquet] class ParquetSchemaConverter( private val assumeBinaryIsString: Boolean, private val assumeInt96IsTimestamp: Boolean, private val followParquetFormatSpec: Boolean) { @@ -136,7 +136,7 @@ private[parquet] class CatalystSchemaConverter( val precision = field.getDecimalMetadata.getPrecision val scale = field.getDecimalMetadata.getScale - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( maxPrecision == -1 || 1 <= precision && precision <= maxPrecision, s"Invalid decimal precision: $typeName cannot store $precision digits (max $maxPrecision)") @@ -170,7 +170,7 @@ private[parquet] class CatalystSchemaConverter( } case INT96 => - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( assumeInt96IsTimestamp, "INT96 is not supported unless it's interpreted as timestamp. 
" + s"Please try to set ${SQLConf.PARQUET_INT96_AS_TIMESTAMP.key} to true.") @@ -212,11 +212,11 @@ private[parquet] class CatalystSchemaConverter( // // See: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists case LIST => - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( field.getFieldCount == 1, s"Invalid list type $field") val repeatedType = field.getType(0) - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( repeatedType.isRepetition(REPEATED), s"Invalid list type $field") if (isElementType(repeatedType, field.getName)) { @@ -232,17 +232,17 @@ private[parquet] class CatalystSchemaConverter( // See: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules-1 // scalastyle:on case MAP | MAP_KEY_VALUE => - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( field.getFieldCount == 1 && !field.getType(0).isPrimitive, s"Invalid map type: $field") val keyValueType = field.getType(0).asGroupType() - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( keyValueType.isRepetition(REPEATED) && keyValueType.getFieldCount == 2, s"Invalid map type: $field") val keyType = keyValueType.getType(0) - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( keyType.isPrimitive, s"Map key type is expected to be a primitive type, but found: $keyType") @@ -325,7 +325,7 @@ private[parquet] class CatalystSchemaConverter( } private def convertField(field: StructField, repetition: Type.Repetition): Type = { - CatalystSchemaConverter.checkFieldName(field.name) + ParquetSchemaConverter.checkFieldName(field.name) field.dataType match { // =================== @@ -540,7 +540,7 @@ private[parquet] class CatalystSchemaConverter( } -private[parquet] object CatalystSchemaConverter { +private[parquet] object ParquetSchemaConverter { def checkFieldName(name: String): Unit = { // ,;{}()\n\t= and space are special characters in Parquet schema analysisRequire( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala similarity index 92% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala rename to sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala index 553d05e8c6b76..49b35998ca53e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala @@ -34,10 +34,10 @@ import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{SpecializedGetters, SpecificMutableRow} import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, minBytesForPrecision} +import org.apache.spark.sql.parquet.ParquetSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, minBytesForPrecision} import org.apache.spark.sql.types._ -private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { +private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer. 
// Here we are using `SpecializedGetters` rather than `InternalRow` so that we can directly access // data in `ArrayData` without the help of `SpecificMutableRow`. @@ -62,7 +62,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi private val decimalBuffer = new Array[Byte](minBytesForPrecision(DecimalType.MAX_PRECISION)) override def init(configuration: Configuration): WriteContext = { - val schemaString = configuration.get(CatalystWriteSupport.SPARK_ROW_SCHEMA) + val schemaString = configuration.get(ParquetWriteSupport.SPARK_ROW_SCHEMA) schema = StructType.fromString(schemaString) rootFieldWriters = schema.map(_.dataType).map(makeWriter) @@ -72,8 +72,8 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get) - val messageType = new CatalystSchemaConverter(configuration).convert(schema) - val metadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> schemaString).asJava + val messageType = new ParquetSchemaConverter(configuration).convert(schema) + val metadata = Map(ParquetReadSupport.SPARK_METADATA_KEY -> schemaString).asJava logDebug( s"""Initialized Parquet WriteSupport with Catalyst schema: @@ -261,13 +261,15 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val elementWriter = makeWriter(elementType) (row: SpecializedGetters, ordinal: Int) => { + val array = row.getArray(ordinal) consumeGroup { - val array = row.getArray(ordinal) + // Only creates the repeated field if the array is non-empty. if (array.numElements() > 0) { consumeField(repeatedGroupName, 0) { var i = 0 while (i < array.numElements()) { consumeGroup { + // Only creates the element field if the current array element is not null. if (!array.isNullAt(i)) { consumeField(elementFieldName, 0)(elementWriter.apply(array, i)) } @@ -285,8 +287,9 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val elementWriter = makeWriter(elementType) (row: SpecializedGetters, ordinal: Int) => { + val array = row.getArray(ordinal) consumeGroup { - val array = row.getArray(ordinal) + // Only creates the repeated field if the array is non-empty. if (array.numElements() > 0) { consumeField(repeatedFieldName, 0) { var i = 0 @@ -308,8 +311,9 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) (row: SpecializedGetters, ordinal: Int) => { + val map = row.getMap(ordinal) consumeGroup { - val map = row.getMap(ordinal) + // Only creates the repeated field if the map is non-empty. 
if (map.numElements() > 0) { consumeField(repeatedGroupName, 0) { var i = 0 @@ -317,6 +321,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi consumeGroup { mutableRow.update(0, map.keyArray().get(i, keyType)) consumeField("key", 0)(keyWriter.apply(mutableRow, 0)) + // Only creates the "value" field if the value if non-empty if (!map.valueArray().isNullAt(i)) { mutableRow.update(1, map.valueArray().get(i, valueType)) consumeField("value", 1)(valueWriter.apply(mutableRow, 1)) @@ -349,11 +354,11 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } -private[parquet] object CatalystWriteSupport { +private[parquet] object ParquetWriteSupport { val SPARK_ROW_SCHEMA: String = "org.apache.spark.sql.parquet.row.attributes" def setSchema(schema: StructType, configuration: Configuration): Unit = { - schema.map(_.name).foreach(CatalystSchemaConverter.checkFieldName) + schema.map(_.name).foreach(ParquetSchemaConverter.checkFieldName) configuration.set(SPARK_ROW_SCHEMA, schema.json) configuration.set( ParquetOutputFormat.WRITER_VERSION, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index 24ab233d22cb9..f340277a90873 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -282,7 +282,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest { assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))) assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE))) - val expectedSchema = new CatalystSchemaConverter(configuration).convert(schema) + val expectedSchema = new ParquetSchemaConverter(configuration).convert(schema) val actualSchema = readFooter(path, configuration).getFileMetaData.getSchema actualSchema.checkContains(expectedSchema) @@ -346,7 +346,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest { """.stripMargin) withTempPath { location => - val extraMetadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> sparkSchema.toString) + val extraMetadata = Map(ParquetReadSupport.SPARK_METADATA_KEY -> sparkSchema.toString) val fileMetadata = new FileMetaData(parquetSchema, extraMetadata, "Spark") val path = new Path(location.getCanonicalPath) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index ec6e37cd5ee85..2101fba4ff07d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -58,7 +58,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { int96AsTimestamp: Boolean = true, followParquetFormatSpec: Boolean = false, isThriftDerived: Boolean = false): Unit = { - val converter = new CatalystSchemaConverter( + val converter = new ParquetSchemaConverter( assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, followParquetFormatSpec = followParquetFormatSpec) @@ -83,7 +83,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { int96AsTimestamp: Boolean = true, followParquetFormatSpec: Boolean = false, isThriftDerived: Boolean = false): Unit = { - val converter = new CatalystSchemaConverter( + val converter = new ParquetSchemaConverter( 
assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, followParquetFormatSpec = followParquetFormatSpec) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala index f5dc9051a2cc2..58cf28794143a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -107,8 +107,8 @@ private[sql] trait ParquetTest extends SQLTestUtils { this: SparkFunSuite => } def writeMetadata(schema: StructType, path: Path, configuration: Configuration): Unit = { - val parquetSchema = new CatalystSchemaConverter(configuration).convert(schema) - val extraMetadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> schema.json).asJava + val parquetSchema = new ParquetSchemaConverter(configuration).convert(schema) + val extraMetadata = Map(ParquetReadSupport.SPARK_METADATA_KEY -> schema.json).asJava val createdBy = s"Apache Spark ${org.apache.spark.SPARK_VERSION}" val fileMetadata = new FileMetaData(parquetSchema, extraMetadata, createdBy) val parquetMetadata = new ParquetMetadata(fileMetadata, Seq.empty[BlockMetaData].asJava) From 0395e9505942647ce8e4e282223ae7fcdde872a3 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 2 Aug 2015 15:53:20 +0800 Subject: [PATCH 17/21] Renames root Parquet message name --- .../org/apache/spark/sql/parquet/ParquetReadSupport.scala | 6 ++++-- .../apache/spark/sql/parquet/ParquetSchemaConverter.scala | 7 ++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala index b67d15264c415..46418a4a8dcdb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala @@ -121,7 +121,9 @@ private[parquet] class ParquetReadSupport extends ReadSupport[InternalRow] with if (fileFieldNames.contains(field.name)) { // If the field exists in the target Parquet file, extracts the field type from the // full file schema and makes a single-field Parquet schema - new MessageType("root", fileSchema.getType(field.name)) + new MessageType( + ParquetSchemaConverter.SPARK_PARQUET_SCHEMA_NAME, + fileSchema.getType(field.name)) } else { // Otherwise, just resorts to `CatalystSchemaConverter` toParquet.convert(StructType(Array(field))) @@ -131,7 +133,7 @@ private[parquet] class ParquetReadSupport extends ReadSupport[InternalRow] with // columns. Note that it's possible that no columns are requested at all (e.g., count // some partition column of a partitioned Parquet table). That's why `fold` is used here // and always fallback to an empty Parquet schema. - .fold(new MessageType("root")) { + .fold(new MessageType(ParquetSchemaConverter.SPARK_PARQUET_SCHEMA_NAME)) { _ union _ } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala index 5c5c1f00f75b4..7495fce40e241 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala @@ -314,7 +314,10 @@ private[parquet] class ParquetSchemaConverter( * Converts a Spark SQL [[StructType]] to a Parquet [[MessageType]]. 
*/ def convert(catalystSchema: StructType): MessageType = { - Types.buildMessage().addFields(catalystSchema.map(convertField): _*).named("root") + Types + .buildMessage() + .addFields(catalystSchema.map(convertField): _*) + .named(ParquetSchemaConverter.SPARK_PARQUET_SCHEMA_NAME) } /** @@ -541,6 +544,8 @@ private[parquet] class ParquetSchemaConverter( private[parquet] object ParquetSchemaConverter { + val SPARK_PARQUET_SCHEMA_NAME = "spark_schema" + def checkFieldName(name: String): Unit = { // ,;{}()\n\t= and space are special characters in Parquet schema analysisRequire( From 23e523d26e91d58be297a898694e359b584b7c6f Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 2 Aug 2015 16:06:26 +0800 Subject: [PATCH 18/21] Makes implicit arguments in ParquetSchemaSuite explicit --- .../sql/parquet/ParquetSchemaSuite.scala | 257 +++++++++++++----- 1 file changed, 185 insertions(+), 72 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index 2101fba4ff07d..84c60f6a9a039 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -36,28 +36,25 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { protected def testSchemaInference[T <: Product: ClassTag: TypeTag]( testName: String, messageType: String, - binaryAsString: Boolean = true, - int96AsTimestamp: Boolean = true, - followParquetFormatSpec: Boolean = false, - isThriftDerived: Boolean = false): Unit = { + binaryAsString: Boolean, + int96AsTimestamp: Boolean, + followParquetFormatSpec: Boolean): Unit = { testSchema( testName, StructType.fromAttributes(ScalaReflection.attributesFor[T]), messageType, binaryAsString, int96AsTimestamp, - followParquetFormatSpec, - isThriftDerived) + followParquetFormatSpec) } protected def testParquetToCatalyst( testName: String, sqlSchema: StructType, parquetSchema: String, - binaryAsString: Boolean = true, - int96AsTimestamp: Boolean = true, - followParquetFormatSpec: Boolean = false, - isThriftDerived: Boolean = false): Unit = { + binaryAsString: Boolean, + int96AsTimestamp: Boolean, + followParquetFormatSpec: Boolean): Unit = { val converter = new ParquetSchemaConverter( assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, @@ -79,10 +76,9 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { testName: String, sqlSchema: StructType, parquetSchema: String, - binaryAsString: Boolean = true, - int96AsTimestamp: Boolean = true, - followParquetFormatSpec: Boolean = false, - isThriftDerived: Boolean = false): Unit = { + binaryAsString: Boolean, + int96AsTimestamp: Boolean, + followParquetFormatSpec: Boolean): Unit = { val converter = new ParquetSchemaConverter( assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, @@ -100,10 +96,9 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { testName: String, sqlSchema: StructType, parquetSchema: String, - binaryAsString: Boolean = true, - int96AsTimestamp: Boolean = true, - followParquetFormatSpec: Boolean = false, - isThriftDerived: Boolean = false): Unit = { + binaryAsString: Boolean, + int96AsTimestamp: Boolean, + followParquetFormatSpec: Boolean): Unit = { testCatalystToParquet( testName, @@ -111,8 +106,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema, 
binaryAsString, int96AsTimestamp, - followParquetFormatSpec, - isThriftDerived) + followParquetFormatSpec) testParquetToCatalyst( testName, @@ -120,8 +114,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema, binaryAsString, int96AsTimestamp, - followParquetFormatSpec, - isThriftDerived) + followParquetFormatSpec) } } @@ -138,7 +131,9 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | optional binary _6; |} """.stripMargin, - binaryAsString = false) + binaryAsString = false, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[(Byte, Short, Int, Long, java.sql.Date)]( "logical integral types", @@ -150,7 +145,10 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | required int64 _4 (INT_64); | optional int32 _5 (DATE); |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[String]]( "string", @@ -159,7 +157,9 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | optional binary _1 (UTF8); |} """.stripMargin, - binaryAsString = true) + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[String]]( "binary enum as string", @@ -167,7 +167,10 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional binary _1 (ENUM); |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[Seq[Int]]]( "non-nullable array - non-standard", @@ -177,7 +180,10 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | repeated int32 array; | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[Seq[Int]]]( "non-nullable array - standard", @@ -190,6 +196,8 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchemaInference[Tuple1[Seq[Integer]]]( @@ -202,7 +210,10 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[Seq[Integer]]]( "nullable array - standard", @@ -215,6 +226,8 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchemaInference[Tuple1[Map[Int, String]]]( @@ -229,6 +242,8 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchemaInference[Tuple1[Map[Int, String]]]( @@ -242,7 +257,10 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[Pair[Int, String]]]( "struct", @@ -254,6 +272,8 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchemaInference[Tuple1[Map[Int, (String, Seq[(Int, Double)])]]]( @@ -277,7 +297,10 @@ class ParquetSchemaInferenceSuite extends 
ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[Map[Int, (String, Seq[(Int, Double)])]]]( "deeply nested type - standard", @@ -301,6 +324,8 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchemaInference[(Option[Int], Map[Int, Option[Double]])]( @@ -316,6 +341,8 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) // Parquet files generated by parquet-thrift are already handled by the schema converter, but @@ -325,26 +352,28 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { // as expected from attributes testSchemaInference[( Array[Byte], Array[Byte], Array[Byte], Seq[Int], Map[Array[Byte], Seq[Int]])]( - "thrift generated parquet schema", - """ - |message root { - | optional binary _1 (UTF8); - | optional binary _2 (UTF8); - | optional binary _3 (UTF8); - | optional group _4 (LIST) { - | repeated int32 _4_tuple; - | } - | optional group _5 (MAP) { - | repeated group map (MAP_KEY_VALUE) { - | required binary key (UTF8); - | optional group value (LIST) { - | repeated int32 value_tuple; - | } - | } - | } - |} - """.stripMargin, - isThriftDerived = true) + "thrift generated parquet schema", + """ + |message root { + | optional binary _1 (UTF8); + | optional binary _2 (UTF8); + | optional binary _3 (UTF8); + | optional group _4 (LIST) { + | repeated int32 _4_tuple; + | } + | optional group _5 (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | optional group value (LIST) { + | repeated int32 value_tuple; + | } + | } + | } + |} + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) } } @@ -471,7 +500,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with nullable element type - 2", @@ -487,7 +519,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 1 - standard", @@ -500,7 +535,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 2", @@ -513,7 +551,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 3", @@ -524,7 +565,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | repeated int32 element; | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 4", @@ -545,7 +589,10 @@ class 
ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 5 - parquet-avro style", @@ -564,7 +611,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 6 - parquet-thrift style", @@ -583,7 +633,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) // ======================================================= // Tests for converting Catalyst ArrayType to Parquet LIST @@ -604,6 +657,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testCatalystToParquet( @@ -620,7 +675,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testCatalystToParquet( "Backwards-compatibility: LIST with non-nullable element type - 1 - standard", @@ -637,6 +695,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testCatalystToParquet( @@ -651,7 +711,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | repeated int32 array; | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) // ==================================================== // Tests for converting Parquet Map to Catalyst MapType @@ -672,7 +735,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: MAP with non-nullable value type - 2", @@ -689,7 +755,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: MAP with non-nullable value type - 3 - prior to 1.4.x", @@ -706,7 +775,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 1 - standard", @@ -723,7 +795,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 2", @@ -740,7 +815,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 3 - parquet-avro style", @@ -757,7 +835,10 
@@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) // ==================================================== // Tests for converting Catalyst MapType to Parquet Map @@ -779,6 +860,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testCatalystToParquet( @@ -796,7 +879,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testCatalystToParquet( "Backwards-compatibility: MAP with nullable value type - 1 - standard", @@ -814,6 +900,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testCatalystToParquet( @@ -831,7 +919,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) // ================================= // Tests for conversion for decimals @@ -844,6 +935,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional int32 f1 (DECIMAL(1, 0)); |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchema( @@ -853,6 +946,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional int32 f1 (DECIMAL(8, 3)); |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchema( @@ -862,6 +957,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional int32 f1 (DECIMAL(9, 3)); |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchema( @@ -871,6 +968,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional int64 f1 (DECIMAL(18, 3)); |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchema( @@ -880,6 +979,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional fixed_len_byte_array(9) f1 (DECIMAL(19, 3)); |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchema( @@ -888,7 +989,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional fixed_len_byte_array(1) f1 (DECIMAL(1, 0)); |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchema( "DECIMAL(8, 3) - prior to 1.4.x", @@ -896,7 +1000,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional fixed_len_byte_array(4) f1 (DECIMAL(8, 3)); |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchema( "DECIMAL(9, 3) - prior to 1.4.x", @@ -904,7 +1011,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3)); |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchema( "DECIMAL(18, 3) - prior to 1.4.x", @@ -912,5 +1022,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional fixed_len_byte_array(8) f1 
(DECIMAL(18, 3)); |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) } From 66419b7e019d94c41b6439287df0cf5a42cb4e15 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 2 Aug 2015 17:08:05 +0800 Subject: [PATCH 19/21] Renames followParquetFormatSpec to writeLegacyParquetFormat (its meaning is flipped) --- .../scala/org/apache/spark/sql/SQLConf.scala | 12 +- .../spark/sql/parquet/ParquetRelation.scala | 10 +- .../sql/parquet/ParquetSchemaConverter.scala | 66 +++---- .../sql/parquet/ParquetWriteSupport.scala | 185 ++++++++++++------ .../sql/parquet/ParquetSchemaSuite.scala | 110 +++++------ .../apache/spark/sql/hive/parquetSuites.scala | 2 +- 6 files changed, 220 insertions(+), 165 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index 6644e85d4a037..290ce472c6d96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -283,12 +283,12 @@ private[spark] object SQLConf { defaultValue = Some(true), doc = "Enables Parquet filter push-down optimization when set to true.") - val PARQUET_FOLLOW_PARQUET_FORMAT_SPEC = booleanConf( - key = "spark.sql.parquet.followParquetFormatSpec", + val PARQUET_WRITE_LEGACY_FORMAT = booleanConf( + key = "spark.sql.parquet.writeLegacyParquetFormat", defaultValue = Some(false), - doc = "Whether to follow Parquet's format specification when converting Parquet schema to " + - "Spark SQL schema and vice versa.", - isPublic = false) + doc = "When true, writes Parquet data in legacy format compatible with Spark 1.4.0 and prior " + + "versions, instead of the standard one defined in parquet-format spec.", + isPublic = true) val PARQUET_OUTPUT_COMMITTER_CLASS = stringConf( key = "spark.sql.parquet.output.committer.class", @@ -493,7 +493,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf { private[spark] def isParquetINT96AsTimestamp: Boolean = getConf(PARQUET_INT96_AS_TIMESTAMP) - private[spark] def followParquetFormatSpec: Boolean = getConf(PARQUET_FOLLOW_PARQUET_FORMAT_SPEC) + private[spark] def writeLegacyParquetFormat: Boolean = getConf(PARQUET_WRITE_LEGACY_FORMAT) private[spark] def inMemoryPartitionPruning: Boolean = getConf(IN_MEMORY_PARTITION_PRUNING) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala index f2e0f78da4433..af54851346992 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala @@ -233,8 +233,8 @@ private[sql] class ParquetRelation( // Sets flag for Parquet schema converter (converting Catalyst schema to Parquet schema) conf.set( - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, - sqlContext.conf.followParquetFormatSpec.toString) + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key, + sqlContext.conf.writeLegacyParquetFormat.toString) // Sets compression scheme conf.set( @@ -521,7 +521,7 @@ private[sql] object ParquetRelation extends Logging { val converter = new ParquetSchemaConverter( sqlContext.conf.isParquetBinaryAsString, sqlContext.conf.isParquetBinaryAsString, - sqlContext.conf.followParquetFormatSpec) + sqlContext.conf.writeLegacyParquetFormat) converter.convert(schema) } @@ -655,7 +655,7 @@ private[sql] object ParquetRelation extends Logging { filesToTouch: 
Seq[FileStatus], sqlContext: SQLContext): Option[StructType] = { val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp - val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec + val writeLegacyParquetFormat = sqlContext.conf.writeLegacyParquetFormat val serializedConf = new SerializableConfiguration(sqlContext.sparkContext.hadoopConfiguration) // HACK ALERT: @@ -695,7 +695,7 @@ private[sql] object ParquetRelation extends Logging { new ParquetSchemaConverter( assumeBinaryIsString = assumeBinaryIsString, assumeInt96IsTimestamp = assumeInt96IsTimestamp, - followParquetFormatSpec = followParquetFormatSpec) + writeLegacyParquetFormat = writeLegacyParquetFormat) footers.map { footer => ParquetRelation.readSchemaFromFooter(footer, converter) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala index 7495fce40e241..80656b04ed7ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala @@ -41,35 +41,33 @@ import org.apache.spark.sql.{AnalysisException, SQLConf} * @constructor * @param assumeBinaryIsString Whether unannotated BINARY fields should be assumed to be Spark SQL * [[StringType]] fields when converting Parquet a [[MessageType]] to Spark SQL - * [[StructType]]. + * [[StructType]]. This argument only affects Parquet read path. * @param assumeInt96IsTimestamp Whether unannotated INT96 fields should be assumed to be Spark SQL * [[TimestampType]] fields when converting Parquet a [[MessageType]] to Spark SQL * [[StructType]]. Note that Spark SQL [[TimestampType]] is similar to Hive timestamp, which * has optional nanosecond precision, but different from `TIME_MILLS` and `TIMESTAMP_MILLIS` - * described in Parquet format spec. - * @param followParquetFormatSpec Whether to generate standard DECIMAL, LIST, and MAP structure when - * converting Spark SQL [[StructType]] to Parquet [[MessageType]]. For Spark 1.4.x and - * prior versions, Spark SQL only supports decimals with a max precision of 18 digits, and - * uses non-standard LIST and MAP structure. Note that the current Parquet format spec is - * backwards-compatible with these settings. If this argument is set to `false`, we fallback - * to old style non-standard behaviors. + * described in Parquet format spec. This argument only affects Parquet read path. + * @param writeLegacyParquetFormat Whether to use legacy Parquet format compatible with Spark 1.4 + * and prior versions when converting a Catalyst [[StructType]] to a Parquet [[MessageType]]. + * When set to false, use standard format defined in parquet-format spec. This argument only + * affects Parquet write path. */ private[parquet] class ParquetSchemaConverter( private val assumeBinaryIsString: Boolean, private val assumeInt96IsTimestamp: Boolean, - private val followParquetFormatSpec: Boolean) { + private val writeLegacyParquetFormat: Boolean) { // Only used when constructing converter for converting Spark SQL schema to Parquet schema, in // which case `assumeInt96IsTimestamp` and `assumeBinaryIsString` are irrelevant. 
def this() = this( assumeBinaryIsString = SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get, assumeInt96IsTimestamp = SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get, - followParquetFormatSpec = SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get) + writeLegacyParquetFormat = SQLConf.PARQUET_WRITE_LEGACY_FORMAT.defaultValue.get) def this(conf: SQLConf) = this( assumeBinaryIsString = conf.isParquetBinaryAsString, assumeInt96IsTimestamp = conf.isParquetINT96AsTimestamp, - followParquetFormatSpec = conf.followParquetFormatSpec) + writeLegacyParquetFormat = conf.writeLegacyParquetFormat) def this(conf: Configuration) = this( assumeBinaryIsString = @@ -80,10 +78,10 @@ private[parquet] class ParquetSchemaConverter( conf.getBoolean( SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get), - followParquetFormatSpec = + writeLegacyParquetFormat = conf.getBoolean( - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get)) + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key, + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.defaultValue.get)) /** * Converts Parquet [[MessageType]] `parquetSchema` to a Spark SQL [[StructType]]. @@ -386,15 +384,15 @@ private[parquet] class ParquetSchemaConverter( case BinaryType => Types.primitive(BINARY, repetition).named(field.name) - // ===================================== - // Decimals (for Spark version <= 1.4.x) - // ===================================== + // ====================== + // Decimals (legacy mode) + // ====================== // Spark 1.4.x and prior versions only support decimals with a maximum precision of 18 and // always store decimals in fixed-length byte arrays. To keep compatibility with these older // versions, here we convert decimals with all precisions to `FIXED_LEN_BYTE_ARRAY` annotated // by `DECIMAL`. 
- case DecimalType.Fixed(precision, scale) if !followParquetFormatSpec => + case DecimalType.Fixed(precision, scale) if writeLegacyParquetFormat => Types .primitive(FIXED_LEN_BYTE_ARRAY, repetition) .as(DECIMAL) @@ -403,13 +401,13 @@ private[parquet] class ParquetSchemaConverter( .length(minBytesForPrecision(precision)) .named(field.name) - // ===================================== - // Decimals (follow Parquet format spec) - // ===================================== + // ======================== + // Decimals (standard mode) + // ======================== // Uses INT32 for 1 <= precision <= 9 case DecimalType.Fixed(precision, scale) - if precision <= MAX_PRECISION_FOR_INT32 && followParquetFormatSpec => + if precision <= MAX_PRECISION_FOR_INT32 && !writeLegacyParquetFormat => Types .primitive(INT32, repetition) .as(DECIMAL) @@ -419,7 +417,7 @@ private[parquet] class ParquetSchemaConverter( // Uses INT64 for 10 <= precision <= 18 case DecimalType.Fixed(precision, scale) - if precision <= MAX_PRECISION_FOR_INT64 && followParquetFormatSpec => + if precision <= MAX_PRECISION_FOR_INT64 && !writeLegacyParquetFormat => Types .primitive(INT64, repetition) .as(DECIMAL) @@ -428,7 +426,7 @@ private[parquet] class ParquetSchemaConverter( .named(field.name) // Uses FIXED_LEN_BYTE_ARRAY for all other precisions - case DecimalType.Fixed(precision, scale) if followParquetFormatSpec => + case DecimalType.Fixed(precision, scale) if !writeLegacyParquetFormat => Types .primitive(FIXED_LEN_BYTE_ARRAY, repetition) .as(DECIMAL) @@ -437,16 +435,16 @@ private[parquet] class ParquetSchemaConverter( .length(minBytesForPrecision(precision)) .named(field.name) - // =================================================== - // ArrayType and MapType (for Spark versions <= 1.4.x) - // =================================================== + // =================================== + // ArrayType and MapType (legacy mode) + // =================================== // Spark 1.4.x and prior versions convert ArrayType with nullable elements into a 3-level // LIST structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level anonymous // field name "array" is from parquet-avro. Note that this case is covered by the backwards- // compatibility rules implemented in `isElementType()`. - case ArrayType(elementType, nullable @ true) if !followParquetFormatSpec => + case ArrayType(elementType, nullable @ true) if writeLegacyParquetFormat => // group (LIST) { // optional group bag { // repeated array; @@ -464,7 +462,7 @@ private[parquet] class ParquetSchemaConverter( // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level // LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is // covered by the backwards-compatibility rules implemented in `isElementType()`. - case ArrayType(elementType, nullable @ false) if !followParquetFormatSpec => + case ArrayType(elementType, nullable @ false) if writeLegacyParquetFormat => // group (LIST) { // repeated array; // } @@ -476,7 +474,7 @@ private[parquet] class ParquetSchemaConverter( // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by // MAP_KEY_VALUE. This is covered by `convertGroupField(field: GroupType): DataType`. 
- case MapType(keyType, valueType, valueContainsNull) if !followParquetFormatSpec => + case MapType(keyType, valueType, valueContainsNull) if writeLegacyParquetFormat => // group (MAP) { // repeated group map (MAP_KEY_VALUE) { // required key; @@ -489,11 +487,11 @@ private[parquet] class ParquetSchemaConverter( convertField(StructField("key", keyType, nullable = false)), convertField(StructField("value", valueType, valueContainsNull))) - // ================================================== - // ArrayType and MapType (follow Parquet format spec) - // ================================================== + // ===================================== + // ArrayType and MapType (standard mode) + // ===================================== - case ArrayType(elementType, containsNull) if followParquetFormatSpec => + case ArrayType(elementType, containsNull) if !writeLegacyParquetFormat => // group (LIST) { // repeated group list { // element; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala index 49b35998ca53e..0e1c17a37e650 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala @@ -37,6 +37,17 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.parquet.ParquetSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, minBytesForPrecision} import org.apache.spark.sql.types._ +/** + * A Parquet [[WriteSupport]] implementation that writes Catalyst [[InternalRow]]s as Parquet + * messages. This class can write Parquet data in two modes: + * + * - Standard mode: Parquet data are written in standard format defined in parquet-format spec. + * - Legacy mode: Parquet data are written in legacy format compatible with Spark 1.4 and prior. + * + * This behavior can be controlled by SQL option `spark.sql.parquet.writeLegacyParquetFormat`. The + * value of the option is propagated to this class by the `init()` method and its Hadoop + * configuration argument. + */ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer. 
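// For example, a minimal standalone sketch of such a writer (the object and method names and
// the import path of SpecializedGetters are assumptions; addInteger/getInt are the calls used
// by the writers below):
object ValueWriterSketch {
  import org.apache.parquet.io.api.RecordConsumer
  import org.apache.spark.sql.catalyst.expressions.SpecializedGetters

  type ValueWriter = (SpecializedGetters, Int) => Unit

  // A writer for a non-null INT32 field: read the value at the given ordinal and forward it
  // to the Parquet record consumer.
  def intWriter(recordConsumer: RecordConsumer): ValueWriter =
    (row: SpecializedGetters, ordinal: Int) => recordConsumer.addInteger(row.getInt(ordinal))
}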
// Here we are using `SpecializedGetters` rather than `InternalRow` so that we can directly access @@ -52,8 +63,8 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit // The Parquet `RecordConsumer` to which all `InternalRow`s are written private var recordConsumer: RecordConsumer = _ - // Whether we should write standard Parquet data conforming to parquet-format spec or not - private var followParquetFormatSpec: Boolean = _ + // Whether to write data in legacy Parquet format compatible with Spark 1.4 and prior versions + private var writeLegacyParquetFormat: Boolean = _ // Reusable byte array used to write timestamps as Parquet INT96 values private val timestampBuffer = new Array[Byte](12) @@ -65,12 +76,10 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit val schemaString = configuration.get(ParquetWriteSupport.SPARK_ROW_SCHEMA) schema = StructType.fromString(schemaString) rootFieldWriters = schema.map(_.dataType).map(makeWriter) - - assert(configuration.get(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key) != null) - followParquetFormatSpec = - configuration.getBoolean( - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get) + writeLegacyParquetFormat = { + assert(configuration.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key) != null) + configuration.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key).toBoolean + } val messageType = new ParquetSchemaConverter(configuration).convert(schema) val metadata = Map(ParquetReadSupport.SPARK_METADATA_KEY -> schemaString).asJava @@ -141,9 +150,15 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit recordConsumer.addBinary(Binary.fromByteArray(row.getUTF8String(ordinal).getBytes)) case TimestampType => - // TODO Writes `TimestampType` values as `TIMESTAMP_MICROS` once parquet-mr implements it (row: SpecializedGetters, ordinal: Int) => { - // Actually Spark SQL `TimestampType` only has microsecond precision. + // TODO Writes `TimestampType` values as `TIMESTAMP_MICROS` once parquet-mr implements it + // Currently we only support timestamps stored as INT96, which is compatible with Hive + // and Impala. However, INT96 is to be deprecated. We plan to support `TIMESTAMP_MICROS` + // defined in the parquet-format spec. But up until writing, the most recent parquet-mr + // version (1.8.1) hasn't implemented it yet. + + // NOTE: Starting from Spark 1.5, Spark SQL `TimestampType` only has microsecond + // precision. Nanosecond parts of timestamp values read from INT96 are simply stripped. 
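// Illustrative standalone sketch of the INT96 layout described above (the wrapper names are
// assumptions; `DateTimeUtils.toJulianDay`, the 12-byte buffer and the little-endian
// long-then-int layout come from the writer code that follows):
object Int96TimestampSketch {
  import java.nio.{ByteBuffer, ByteOrder}
  import org.apache.spark.sql.catalyst.util.DateTimeUtils

  def toInt96Bytes(microsSinceEpoch: Long): Array[Byte] = {
    val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(microsSinceEpoch)
    val bytes = new Array[Byte](12)
    ByteBuffer.wrap(bytes)
      .order(ByteOrder.LITTLE_ENDIAN)
      .putLong(timeOfDayNanos) // bytes 0-7: nanoseconds within the Julian day
      .putInt(julianDay)       // bytes 8-11: Julian day number
    bytes
  }
}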
val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(row.getLong(ordinal)) val buf = ByteBuffer.wrap(timestampBuffer) buf.order(ByteOrder.LITTLE_ENDIAN).putLong(timeOfDayNanos).putInt(julianDay) @@ -162,26 +177,14 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit (row: SpecializedGetters, ordinal: Int) => consumeGroup(writeFields(row.getStruct(ordinal, t.length), t, fieldWriters)) - case ArrayType(elementType, _) if followParquetFormatSpec => - makeThreeLevelArrayWriter(elementType, "list", "element") - - case ArrayType(elementType, true) if !followParquetFormatSpec => - makeThreeLevelArrayWriter(elementType, "bag", "array") - - case ArrayType(elementType, false) if !followParquetFormatSpec => - makeTwoLevelArrayWriter(elementType, "array") + case t: ArrayType => makeArrayWriter(t) - case t: MapType if followParquetFormatSpec => - makeMapWriter(t, "key_value") + case t: MapType => makeMapWriter(t) - case t: MapType if !followParquetFormatSpec => - makeMapWriter(t, "map") + case t: UserDefinedType[_] => makeWriter(t.sqlType) - case t: UserDefinedType[_] => - makeWriter(t.sqlType) - - case _ => - sys.error(s"Unsupported data type $dataType.") + // TODO Adds IntervalType support + case _ => sys.error(s"Unsupported data type $dataType.") } } @@ -239,15 +242,15 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit recordConsumer.addBinary(Binary.fromByteArray(fixedLengthBytes, 0, numBytes)) } - followParquetFormatSpec match { + writeLegacyParquetFormat match { // Standard mode, writes decimals with precision <= 9 as INT32 - case true if precision <= MAX_PRECISION_FOR_INT32 => int32Writer + case false if precision <= MAX_PRECISION_FOR_INT32 => int32Writer // Standard mode, writes decimals with precision <= 18 as INT64 - case true if precision <= MAX_PRECISION_FOR_INT64 => int64Writer + case false if precision <= MAX_PRECISION_FOR_INT64 => int64Writer // Legacy mode, writes decimals with precision <= 18 as FIXED_LEN_BYTE_ARRAY - case false if precision <= MAX_PRECISION_FOR_INT64 => binaryWriterUsingUnscaledLong + case true if precision <= MAX_PRECISION_FOR_INT64 => binaryWriterUsingUnscaledLong // All other cases: // - Standard mode, writes decimals with precision > 18 as FIXED_LEN_BYTE_ARRAY @@ -256,59 +259,113 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit } } - private def makeThreeLevelArrayWriter( - elementType: DataType, repeatedGroupName: String, elementFieldName: String): ValueWriter = { - val elementWriter = makeWriter(elementType) + def makeArrayWriter(arrayType: ArrayType): ValueWriter = { + val elementWriter = makeWriter(arrayType.elementType) - (row: SpecializedGetters, ordinal: Int) => { - val array = row.getArray(ordinal) - consumeGroup { - // Only creates the repeated field if the array is non-empty. - if (array.numElements() > 0) { - consumeField(repeatedGroupName, 0) { - var i = 0 - while (i < array.numElements()) { - consumeGroup { - // Only creates the element field if the current array element is not null. - if (!array.isNullAt(i)) { - consumeField(elementFieldName, 0)(elementWriter.apply(array, i)) + def threeLevelArrayWriter(repeatedGroupName: String, elementFieldName: String): ValueWriter = + (row: SpecializedGetters, ordinal: Int) => { + val array = row.getArray(ordinal) + consumeGroup { + // Only creates the repeated field if the array is non-empty. 
+ if (array.numElements() > 0) { + consumeField(repeatedGroupName, 0) { + var i = 0 + while (i < array.numElements()) { + consumeGroup { + // Only creates the element field if the current array element is not null. + if (!array.isNullAt(i)) { + consumeField(elementFieldName, 0)(elementWriter.apply(array, i)) + } } + i += 1 } - i += 1 } } } } - } - } - private def makeTwoLevelArrayWriter( - elementType: DataType, repeatedFieldName: String): ValueWriter = { - val elementWriter = makeWriter(elementType) - - (row: SpecializedGetters, ordinal: Int) => { - val array = row.getArray(ordinal) - consumeGroup { - // Only creates the repeated field if the array is non-empty. - if (array.numElements() > 0) { - consumeField(repeatedFieldName, 0) { - var i = 0 - while (i < array.numElements()) { - elementWriter.apply(array, i) - i += 1 + def twoLevelArrayWriter(repeatedFieldName: String): ValueWriter = + (row: SpecializedGetters, ordinal: Int) => { + val array = row.getArray(ordinal) + consumeGroup { + // Only creates the repeated field if the array is non-empty. + if (array.numElements() > 0) { + consumeField(repeatedFieldName, 0) { + var i = 0 + while (i < array.numElements()) { + elementWriter.apply(array, i) + i += 1 + } } } } } + + (writeLegacyParquetFormat, arrayType.containsNull) match { + case (false, _) => + // Standard mode: + // + // group (LIST) { + // repeated group list { + // ^~~~ repeatedGroupName + // element; + // ^~~~~~~ elementFieldName + // } + // } + threeLevelArrayWriter(repeatedGroupName = "list", elementFieldName = "element") + + case (true, true) => + // Legacy mode, with nullable elements: + // + // group (LIST) { + // optional group bag { + // ^~~ repeatedGroupName + // repeated array; + // ^~~~~ elementFieldName + // } + // } + threeLevelArrayWriter(repeatedGroupName = "bag", elementFieldName = "array") + + case (true, false) => + // Legacy mode, with non-nullable elements: + // + // group (LIST) { + // repeated array; + // ^~~~~ repeatedFieldName + // } + twoLevelArrayWriter(repeatedFieldName = "array") } } - private def makeMapWriter(mapType: MapType, repeatedGroupName: String): ValueWriter = { + private def makeMapWriter(mapType: MapType): ValueWriter = { val keyType = mapType.keyType val valueType = mapType.valueType val keyWriter = makeWriter(keyType) val valueWriter = makeWriter(valueType) val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) + val repeatedGroupName = if (writeLegacyParquetFormat) { + // Legacy mode: + // + // group (MAP) { + // repeated group map (MAP_KEY_VALUE) { + // ^~~ repeatedGroupName + // required key; + // value; + // } + // } + "map" + } else { + // Standard mode: + // + // group (MAP) { + // repeated group key_value { + // ^~~~~~~~~ repeatedGroupName + // required key; + // value; + // } + // } + "key_value" + } (row: SpecializedGetters, ordinal: Int) => { val map = row.getMap(ordinal) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index 84c60f6a9a039..230983b243771 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -38,14 +38,14 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { messageType: String, binaryAsString: Boolean, int96AsTimestamp: Boolean, - followParquetFormatSpec: Boolean): Unit = { + writeLegacyParquetFormat: Boolean): Unit = { testSchema( 
testName, StructType.fromAttributes(ScalaReflection.attributesFor[T]), messageType, binaryAsString, int96AsTimestamp, - followParquetFormatSpec) + writeLegacyParquetFormat) } protected def testParquetToCatalyst( @@ -54,11 +54,11 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema: String, binaryAsString: Boolean, int96AsTimestamp: Boolean, - followParquetFormatSpec: Boolean): Unit = { + writeLegacyParquetFormat: Boolean): Unit = { val converter = new ParquetSchemaConverter( assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, - followParquetFormatSpec = followParquetFormatSpec) + writeLegacyParquetFormat = writeLegacyParquetFormat) test(s"sql <= parquet: $testName") { val actual = converter.convert(MessageTypeParser.parseMessageType(parquetSchema)) @@ -78,11 +78,11 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema: String, binaryAsString: Boolean, int96AsTimestamp: Boolean, - followParquetFormatSpec: Boolean): Unit = { + writeLegacyParquetFormat: Boolean): Unit = { val converter = new ParquetSchemaConverter( assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, - followParquetFormatSpec = followParquetFormatSpec) + writeLegacyParquetFormat = writeLegacyParquetFormat) test(s"sql => parquet: $testName") { val actual = converter.convert(sqlSchema) @@ -98,7 +98,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema: String, binaryAsString: Boolean, int96AsTimestamp: Boolean, - followParquetFormatSpec: Boolean): Unit = { + writeLegacyParquetFormat: Boolean): Unit = { testCatalystToParquet( testName, @@ -106,7 +106,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema, binaryAsString, int96AsTimestamp, - followParquetFormatSpec) + writeLegacyParquetFormat) testParquetToCatalyst( testName, @@ -114,7 +114,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema, binaryAsString, int96AsTimestamp, - followParquetFormatSpec) + writeLegacyParquetFormat) } } @@ -133,7 +133,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = false, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[(Byte, Short, Int, Long, java.sql.Date)]( "logical integral types", @@ -148,7 +148,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[String]]( "string", @@ -159,7 +159,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[String]]( "binary enum as string", @@ -170,7 +170,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[Seq[Int]]]( "non-nullable array - non-standard", @@ -183,7 +183,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[Seq[Int]]]( "non-nullable array - 
standard", @@ -198,7 +198,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchemaInference[Tuple1[Seq[Integer]]]( "nullable array - non-standard", @@ -213,7 +213,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[Seq[Integer]]]( "nullable array - standard", @@ -228,7 +228,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchemaInference[Tuple1[Map[Int, String]]]( "map - standard", @@ -244,7 +244,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchemaInference[Tuple1[Map[Int, String]]]( "map - non-standard", @@ -260,7 +260,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[Pair[Int, String]]]( "struct", @@ -274,7 +274,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchemaInference[Tuple1[Map[Int, (String, Seq[(Int, Double)])]]]( "deeply nested type - non-standard", @@ -300,7 +300,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[Map[Int, (String, Seq[(Int, Double)])]]]( "deeply nested type - standard", @@ -326,7 +326,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchemaInference[(Option[Int], Map[Int, Option[Double]])]( "optional types", @@ -343,7 +343,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) // Parquet files generated by parquet-thrift are already handled by the schema converter, but // let's leave this test here until both read path and write path are all updated. 
@@ -373,7 +373,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) } } @@ -503,7 +503,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with nullable element type - 2", @@ -522,7 +522,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 1 - standard", @@ -538,7 +538,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 2", @@ -554,7 +554,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 3", @@ -568,7 +568,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 4", @@ -592,7 +592,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 5 - parquet-avro style", @@ -614,7 +614,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 6 - parquet-thrift style", @@ -636,7 +636,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) // ======================================================= // Tests for converting Catalyst ArrayType to Parquet LIST @@ -659,7 +659,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testCatalystToParquet( "Backwards-compatibility: LIST with nullable element type - 2 - prior to 1.4.x", @@ -678,7 +678,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testCatalystToParquet( "Backwards-compatibility: LIST with non-nullable element type - 1 - standard", @@ -697,7 +697,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testCatalystToParquet( 
"Backwards-compatibility: LIST with non-nullable element type - 2 - prior to 1.4.x", @@ -714,7 +714,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) // ==================================================== // Tests for converting Parquet Map to Catalyst MapType @@ -738,7 +738,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: MAP with non-nullable value type - 2", @@ -758,7 +758,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: MAP with non-nullable value type - 3 - prior to 1.4.x", @@ -778,7 +778,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 1 - standard", @@ -798,7 +798,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 2", @@ -818,7 +818,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 3 - parquet-avro style", @@ -838,7 +838,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) // ==================================================== // Tests for converting Catalyst MapType to Parquet Map @@ -862,7 +862,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testCatalystToParquet( "Backwards-compatibility: MAP with non-nullable value type - 2 - prior to 1.4.x", @@ -882,7 +882,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testCatalystToParquet( "Backwards-compatibility: MAP with nullable value type - 1 - standard", @@ -902,7 +902,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testCatalystToParquet( "Backwards-compatibility: MAP with nullable value type - 3 - prior to 1.4.x", @@ -922,7 +922,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) // ================================= // Tests for conversion for decimals @@ -937,7 +937,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString 
= true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchema( "DECIMAL(8, 3) - standard", @@ -948,7 +948,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchema( "DECIMAL(9, 3) - standard", @@ -959,7 +959,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchema( "DECIMAL(18, 3) - standard", @@ -970,7 +970,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchema( "DECIMAL(19, 3) - standard", @@ -981,7 +981,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchema( "DECIMAL(1, 0) - prior to 1.4.x", @@ -992,7 +992,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchema( "DECIMAL(8, 3) - prior to 1.4.x", @@ -1003,7 +1003,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchema( "DECIMAL(9, 3) - prior to 1.4.x", @@ -1014,7 +1014,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchema( "DECIMAL(18, 3) - prior to 1.4.x", @@ -1025,5 +1025,5 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala index f56fb96c52d37..0cb9f048c9fcd 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala @@ -608,7 +608,7 @@ class ParquetSourceSuite extends ParquetPartitioningTest { val conf = Seq( HiveContext.CONVERT_METASTORE_PARQUET.key -> "false", SQLConf.PARQUET_BINARY_AS_STRING.key -> "true", - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key -> "true") + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "false") withSQLConf(conf: _*) { sql( From 6bda94ba32ebb61d089c5392f1afdbf5613b8549 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 2 Aug 2015 17:38:19 +0800 Subject: [PATCH 20/21] Renames analysisRequire to checkConversionRequirement --- .../sql/parquet/ParquetSchemaConverter.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala index 80656b04ed7ba..c5bab35aa421f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala @@ -134,7 +134,7 @@ 
private[parquet] class ParquetSchemaConverter( val precision = field.getDecimalMetadata.getPrecision val scale = field.getDecimalMetadata.getScale - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( maxPrecision == -1 || 1 <= precision && precision <= maxPrecision, s"Invalid decimal precision: $typeName cannot store $precision digits (max $maxPrecision)") @@ -168,7 +168,7 @@ private[parquet] class ParquetSchemaConverter( } case INT96 => - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( assumeInt96IsTimestamp, "INT96 is not supported unless it's interpreted as timestamp. " + s"Please try to set ${SQLConf.PARQUET_INT96_AS_TIMESTAMP.key} to true.") @@ -210,11 +210,11 @@ private[parquet] class ParquetSchemaConverter( // // See: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists case LIST => - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( field.getFieldCount == 1, s"Invalid list type $field") val repeatedType = field.getType(0) - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( repeatedType.isRepetition(REPEATED), s"Invalid list type $field") if (isElementType(repeatedType, field.getName)) { @@ -230,17 +230,17 @@ private[parquet] class ParquetSchemaConverter( // See: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules-1 // scalastyle:on case MAP | MAP_KEY_VALUE => - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( field.getFieldCount == 1 && !field.getType(0).isPrimitive, s"Invalid map type: $field") val keyValueType = field.getType(0).asGroupType() - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( keyValueType.isRepetition(REPEATED) && keyValueType.getFieldCount == 2, s"Invalid map type: $field") val keyType = keyValueType.getType(0) - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( keyType.isPrimitive, s"Map key type is expected to be a primitive type, but found: $keyType") @@ -546,7 +546,7 @@ private[parquet] object ParquetSchemaConverter { def checkFieldName(name: String): Unit = { // ,;{}()\n\t= and space are special characters in Parquet schema - analysisRequire( + checkConversionRequirement( !name.matches(".*[ ,;{}()\n\t=].*"), s"""Attribute name "$name" contains invalid character(s) among " ,;{}()\\n\\t=". |Please use alias to rename it. 
@@ -558,7 +558,7 @@ private[parquet] object ParquetSchemaConverter { schema } - def analysisRequire(f: => Boolean, message: String): Unit = { + def checkConversionRequirement(f: => Boolean, message: String): Unit = { if (!f) { throw new AnalysisException(message) } From 679888afa7f4128fb9bbbb021bd5ec06467f88ee Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 3 Aug 2015 18:58:41 +0800 Subject: [PATCH 21/21] Simplifies ParquetSchemaConverter and updates outdated comments --- .../sql/parquet/ParquetSchemaConverter.scala | 264 +++++++----------- .../sql/parquet/ParquetWriteSupport.scala | 10 +- 2 files changed, 109 insertions(+), 165 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala index c5bab35aa421f..26ca6b6cc5946 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala @@ -329,10 +329,6 @@ private[parquet] class ParquetSchemaConverter( ParquetSchemaConverter.checkFieldName(field.name) field.dataType match { - // =================== - // Simple atomic types - // =================== - case BooleanType => Types.primitive(BOOLEAN, repetition).named(field.name) @@ -363,173 +359,123 @@ private[parquet] class ParquetSchemaConverter( // NOTE: Spark SQL TimestampType is NOT a well defined type in Parquet format spec. // // As stated in PARQUET-323, Parquet `INT96` was originally introduced to represent nanosecond - // timestamp in Impala for some historical reasons, it's not recommended to be used for any - // other types and will probably be deprecated in future Parquet format spec. That's the - // reason why Parquet format spec only defines `TIMESTAMP_MILLIS` and `TIMESTAMP_MICROS` which - // are both logical types annotating `INT64`. + // timestamp in Impala for some historical reasons. It's not recommended to be used for any + // other types and will probably be deprecated in some future version of parquet-format spec. + // That's the reason why parquet-format spec only defines `TIMESTAMP_MILLIS` and + // `TIMESTAMP_MICROS` which are both logical types annotating `INT64`. // // Originally, Spark SQL uses the same nanosecond timestamp type as Impala and Hive. Starting - // from Spark 1.5.0, we resort to a timestamp type with 100 ns precision so that we can store - // a timestamp into a `Long`. This design decision is subject to change though, for example, - // we may resort to microsecond precision in the future. + // from Spark 1.5.0, we resort to microsecond timestamp type. // - // For Parquet, we plan to write all `TimestampType` value as `TIMESTAMP_MICROS`, but it's - // currently not implemented yet because parquet-mr 1.7.0 (the version we're currently using) - // hasn't implemented `TIMESTAMP_MICROS` yet. + // We plan to write all `TimestampType` values as `TIMESTAMP_MICROS`, but up to writing, the + // most recent version of parquet-mr (1.8.1) hasn't implemented `TIMESTAMP_MICROS` yet. // - // TODO Implements `TIMESTAMP_MICROS` once parquet-mr has that. + // TODO Converts to `TIMESTAMP_MICROS` once parquet-mr implements that. 
case TimestampType => Types.primitive(INT96, repetition).named(field.name) case BinaryType => Types.primitive(BINARY, repetition).named(field.name) - // ====================== - // Decimals (legacy mode) - // ====================== - - // Spark 1.4.x and prior versions only support decimals with a maximum precision of 18 and - // always store decimals in fixed-length byte arrays. To keep compatibility with these older - // versions, here we convert decimals with all precisions to `FIXED_LEN_BYTE_ARRAY` annotated - // by `DECIMAL`. - case DecimalType.Fixed(precision, scale) if writeLegacyParquetFormat => - Types - .primitive(FIXED_LEN_BYTE_ARRAY, repetition) - .as(DECIMAL) - .precision(precision) - .scale(scale) - .length(minBytesForPrecision(precision)) - .named(field.name) - - // ======================== - // Decimals (standard mode) - // ======================== - - // Uses INT32 for 1 <= precision <= 9 - case DecimalType.Fixed(precision, scale) - if precision <= MAX_PRECISION_FOR_INT32 && !writeLegacyParquetFormat => - Types - .primitive(INT32, repetition) - .as(DECIMAL) - .precision(precision) - .scale(scale) - .named(field.name) - - // Uses INT64 for 10 <= precision <= 18 - case DecimalType.Fixed(precision, scale) - if precision <= MAX_PRECISION_FOR_INT64 && !writeLegacyParquetFormat => - Types - .primitive(INT64, repetition) - .as(DECIMAL) - .precision(precision) - .scale(scale) - .named(field.name) - - // Uses FIXED_LEN_BYTE_ARRAY for all other precisions - case DecimalType.Fixed(precision, scale) if !writeLegacyParquetFormat => - Types - .primitive(FIXED_LEN_BYTE_ARRAY, repetition) - .as(DECIMAL) - .precision(precision) - .scale(scale) - .length(minBytesForPrecision(precision)) - .named(field.name) - - // =================================== - // ArrayType and MapType (legacy mode) - // =================================== - - // Spark 1.4.x and prior versions convert ArrayType with nullable elements into a 3-level - // LIST structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro - // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level anonymous - // field name "array" is from parquet-avro. Note that this case is covered by the backwards- - // compatibility rules implemented in `isElementType()`. - case ArrayType(elementType, nullable @ true) if writeLegacyParquetFormat => - // group (LIST) { - // optional group bag { - // repeated array; - // } - // } - ConversionPatterns.listType( - repetition, - field.name, - Types - .buildGroup(REPEATED) - // "array" is the name chosen by Spark SQL 1.4.0 and prior versions - .addField(convertField(StructField("array", elementType, nullable))) - .named("bag")) - - // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level - // LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is - // covered by the backwards-compatibility rules implemented in `isElementType()`. - case ArrayType(elementType, nullable @ false) if writeLegacyParquetFormat => - // group (LIST) { - // repeated array; - // } - ConversionPatterns.listType( - repetition, - field.name, - // "array" is the name chosen by parquet-avro (1.7.0 and prior version) - convertField(StructField("array", elementType, nullable), REPEATED)) - - // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by - // MAP_KEY_VALUE. This is covered by `convertGroupField(field: GroupType): DataType`. 
- case MapType(keyType, valueType, valueContainsNull) if writeLegacyParquetFormat => - // group (MAP) { - // repeated group map (MAP_KEY_VALUE) { - // required key; - // value; - // } - // } - ConversionPatterns.mapType( - repetition, - field.name, - convertField(StructField("key", keyType, nullable = false)), - convertField(StructField("value", valueType, valueContainsNull))) - - // ===================================== - // ArrayType and MapType (standard mode) - // ===================================== - - case ArrayType(elementType, containsNull) if !writeLegacyParquetFormat => - // group (LIST) { - // repeated group list { - // element; - // } - // } - Types - .buildGroup(repetition).as(LIST) - .addField( - Types.repeatedGroup() - .addField(convertField(StructField("element", elementType, containsNull))) - .named("list")) - .named(field.name) - - case MapType(keyType, valueType, valueContainsNull) => - // group (MAP) { - // repeated group key_value { - // required key; - // value; - // } - // } - Types - .buildGroup(repetition).as(MAP) - .addField( + case DecimalType.Fixed(precision, scale) => + val builder = writeLegacyParquetFormat match { + // Standard mode, 1 <= precision <= 9, converts to INT32 based DECIMAL + case false if precision <= MAX_PRECISION_FOR_INT32 => + Types.primitive(INT32, repetition) + + // Standard mode, 10 <= precision <= 18, converts to INT64 based DECIMAL + case false if precision <= MAX_PRECISION_FOR_INT64 => + Types.primitive(INT64, repetition) + + // All other cases: + // - Standard mode, 19 <= precision <= 38, converts to FIXED_LEN_BYTE_ARRAY based DECIMAL + // - Legacy mode, 1 <= precision <= 38, converts to FIXED_LEN_BYTE_ARRAY based DECIMAL + case _ => + val numBytes = minBytesForPrecision(precision) + Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(numBytes) + } + + builder.as(DECIMAL).precision(precision).scale(scale).named(field.name) + + case t: ArrayType => + val repeatedType = (writeLegacyParquetFormat, t.containsNull) match { + case (true, true) => + // Legacy mode: Spark 1.4.x and prior versions convert `ArrayType` with nullable + // elements into a 3-level `LIST` structure. This behavior is somewhat a hybrid of + // parquet-hive and parquet-avro (1.6.0rc3): the 3-level structure is similar to + // parquet-hive while the 3rd level anonymous field name "array" is from parquet-avro. + // + // group (LIST) { + // repeated group bag { | + // optional array; |- repeatedType + // } | + // } Types .repeatedGroup() - .addField(convertField(StructField("key", keyType, nullable = false))) - .addField(convertField(StructField("value", valueType, valueContainsNull))) - .named("key_value")) - .named(field.name) - - // =========== - // Other types - // =========== - - case StructType(fields) => - fields.foldLeft(Types.buildGroup(repetition)) { (builder, field) => - builder.addField(convertField(field)) - }.named(field.name) + .addField(convertField(StructField("array", t.elementType, t.containsNull))) + .named("bag") + + case (true, false) => + // Legacy mode: Spark 1.4.x and prior versions convert `ArrayType` with non-nullable + // elements into a 2-level `LIST` structure. This behavior mimics parquet-avro + // (1.6.0rc3). 
+ // + // group (LIST) { + // repeated array; <- repeatedType + // } + convertField(StructField("array", t.elementType, t.containsNull), REPEATED) + + case (false, _) => + // Standard mode: + // + // group (LIST) { + // repeated group list { | + // element; |- repeatedType + // } | + // } + Types + .repeatedGroup() + .addField(convertField(StructField("element", t.elementType, t.containsNull))) + .named("list") + } + + Types.buildGroup(repetition).as(LIST).addField(repeatedType).named(field.name) + + case t: MapType => + val repeatedGroupBuilder = + Types + .repeatedGroup() + .addField(convertField(StructField("key", t.keyType, nullable = false))) + .addField(convertField(StructField("value", t.valueType, t.valueContainsNull))) + + val repeatedGroup = if (writeLegacyParquetFormat) { + // Legacy mode: Spark 1.4.x and prior versions convert MapType into a 3-level group + // annotated by MAP_KEY_VALUE. + // + // group (MAP) { + // repeated group map (MAP_KEY_VALUE) { | + // required key; |- repeatedGroup + // value; | + // } | + // } + repeatedGroupBuilder.as(MAP_KEY_VALUE).named("map") + } else { + // Standard mode: + // + // group (MAP) { + // repeated group key_value { | + // required key; |- repeatedGroup + // value; | + // } | + // } + repeatedGroupBuilder.named("key_value") + } + + Types.buildGroup(repetition).as(MAP).addField(repeatedGroup).named(field.name) + + case t: StructType => + val parquetFields = t.fields.map(convertField) + Types.buildGroup(repetition).addFields(parquetFields: _*).named(field.name) case udt: UserDefinedType[_] => convertField(field.copy(dataType = udt.sqlType)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala index 0e1c17a37e650..17e0aaa360fba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala @@ -243,18 +243,16 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit } writeLegacyParquetFormat match { - // Standard mode, writes decimals with precision <= 9 as INT32 + // Standard mode, 1 <= precision <= 9, writes as INT32 case false if precision <= MAX_PRECISION_FOR_INT32 => int32Writer - // Standard mode, writes decimals with precision <= 18 as INT64 + // Standard mode, 10 <= precision <= 18, writes as INT64 case false if precision <= MAX_PRECISION_FOR_INT64 => int64Writer - // Legacy mode, writes decimals with precision <= 18 as FIXED_LEN_BYTE_ARRAY + // Legacy mode, 1 <= precision <= 18, writes as FIXED_LEN_BYTE_ARRAY case true if precision <= MAX_PRECISION_FOR_INT64 => binaryWriterUsingUnscaledLong - // All other cases: - // - Standard mode, writes decimals with precision > 18 as FIXED_LEN_BYTE_ARRAY - // - Legacy mode, writes decimals with precision > 18 as FIXED_LEN_BYTE_ARRAY + // Either standard or legacy mode, 19 <= precision <= 38, writes as FIXED_LEN_BYTE_ARRAY case _ => binaryWriterUsingUnscaledBytes } }
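// Illustrative sketch (field names and types are assumptions): the two MAP layouts described
// in the `convertField` and `makeMapWriter` comments above, written out as Parquet schema
// strings and parsed back with MessageTypeParser.
object MapLayoutSketch {
  import org.apache.parquet.schema.MessageTypeParser

  // Standard mode: the repeated group is named "key_value" and carries no annotation.
  val standardMap = MessageTypeParser.parseMessageType(
    """message root {
      |  optional group f (MAP) {
      |    repeated group key_value {
      |      required int32 key;
      |      optional binary value (UTF8);
      |    }
      |  }
      |}
    """.stripMargin)

  // Legacy mode (Spark 1.4.x and prior): the repeated group is named "map" and is annotated
  // with MAP_KEY_VALUE.
  val legacyMap = MessageTypeParser.parseMessageType(
    """message root {
      |  optional group f (MAP) {
      |    repeated group map (MAP_KEY_VALUE) {
      |      required int32 key;
      |      optional binary value (UTF8);
      |    }
      |  }
      |}
    """.stripMargin)
}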