From e9305bd0a79b3bb03f1bd9b646581951fd571cd6 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 26 Jul 2015 22:52:28 +0800 Subject: [PATCH 01/21] Refactors Parquet write support to follow Parquet format spec --- .../sql/parquet/CatalystReadSupport.scala | 4 +- .../sql/parquet/CatalystWriteSupport.scala | 307 +++++++++++++++++ .../spark/sql/parquet/ParquetRelation.scala | 32 +- .../sql/parquet/ParquetTableSupport.scala | 322 ------------------ 4 files changed, 321 insertions(+), 344 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala index 975fec101d9c2..9648035744c1d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala @@ -64,7 +64,7 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with // If the target file was written by Spark SQL, we should be able to find a serialized Catalyst // schema of this file from its the metadata. - val maybeRowSchema = Option(conf.get(RowWriteSupport.SPARK_ROW_SCHEMA)) + val maybeRowSchema = Option(conf.get(CatalystWriteSupport.SPARK_ROW_SCHEMA)) // Optional schema of requested columns, in the form of a string serialized from a Catalyst // `StructType` containing all requested columns. @@ -139,7 +139,7 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with val metadata = Map.empty[String, String] ++ maybeRequestedSchema.map(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++ - maybeRowSchema.map(RowWriteSupport.SPARK_ROW_SCHEMA -> _) + maybeRowSchema.map(CatalystWriteSupport.SPARK_ROW_SCHEMA -> _) logInfo(s"Going to read Parquet file with these requested columns: $parquetRequestedSchema") new ReadContext(parquetRequestedSchema, metadata) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala new file mode 100644 index 0000000000000..8410ae26f8705 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -0,0 +1,307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.parquet + +import java.nio.{ByteBuffer, ByteOrder} + +import scala.collection.JavaConverters.mapAsJavaMapConverter + +import org.apache.hadoop.conf.Configuration +import org.apache.parquet.column.ParquetProperties +import org.apache.parquet.hadoop.ParquetOutputFormat +import org.apache.parquet.hadoop.api.WriteSupport +import org.apache.parquet.hadoop.api.WriteSupport.WriteContext +import org.apache.parquet.io.api.{Binary, RecordConsumer} + +import org.apache.spark.Logging +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.types._ + +private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { + type ValueConsumer = (InternalRow, Int) => Unit + + private var schema: StructType = _ + + private var recordConsumer: RecordConsumer = _ + + private var followParquetFormatSpec: Boolean = _ + + // Byte array used to write timestamps as Parquet INT96 values + private val timestampBuffer = new Array[Byte](12) + + // Byte array used to write decimal values + private val decimalBuffer = new Array[Byte](8) + + override def init(configuration: Configuration): WriteContext = { + val schemaString = configuration.get(CatalystWriteSupport.SPARK_ROW_SCHEMA) + schema = StructType.fromString(schemaString) + + assert(configuration.get(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key) != null) + followParquetFormatSpec = + configuration.getBoolean( + SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, + SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get) + + val messageType = new CatalystSchemaConverter(configuration).convert(schema) + val metadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> schemaString).asJava + + logDebug( + s"""Initialized Parquet WriteSupport with Catalyst schema: + |${schema.prettyJson} + |and corresponding Parquet message type: + |$messageType + """.stripMargin) + + new WriteContext(messageType, metadata) + } + + override def prepareForWrite(recordConsumer: RecordConsumer): Unit = { + this.recordConsumer = recordConsumer + } + + override def write(row: InternalRow): Unit = { + assert(row.numFields == schema.length) + recordConsumer.startMessage() + writeFields(row) + recordConsumer.endMessage() + } + + private def writeFields(row: InternalRow): Unit = { + val consumers = schema.map(_.dataType).map(makeConsumer) + var i = 0 + + while (i < row.numFields) { + if (!row.isNullAt(i)) { + consumeField(schema(i).name, i) { + consumers(i).apply(row, i) + } + } + + i += 1 + } + } + + private def makeConsumer(dataType: DataType): ValueConsumer = { + dataType match { + case BooleanType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addBoolean(row.getBoolean(ordinal)) + + case ByteType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addInteger(row.getByte(ordinal)) + + case ShortType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addInteger(row.getShort(ordinal)) + + case IntegerType | DateType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addInteger(row.getInt(ordinal)) + + case LongType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addLong(row.getLong(ordinal)) + + case FloatType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addFloat(row.getFloat(ordinal)) + + case DoubleType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addDouble(row.getDouble(ordinal)) + + 
case StringType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addBinary(Binary.fromByteArray(row.getUTF8String(ordinal).getBytes)) + + case TimestampType => + (row: InternalRow, ordinal: Int) => { + val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(row.getLong(ordinal)) + val buf = ByteBuffer.wrap(timestampBuffer) + buf.order(ByteOrder.LITTLE_ENDIAN).putLong(timeOfDayNanos).putInt(julianDay) + recordConsumer.addBinary(Binary.fromByteArray(timestampBuffer)) + } + + case BinaryType => + (row: InternalRow, ordinal: Int) => + recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal))) + + case DecimalType.Unlimited => + sys.error(s"Unsupported data type $dataType. Decimal precision must be specified.") + + case DecimalType.Fixed(precision, _) if precision > 18 => + sys.error(s"Unsupported data type $dataType. Decimal precision cannot be greater than 18.") + + case DecimalType.Fixed(precision) => + (row: InternalRow, ordinal: Int) => { + val decimal = row.getDecimal(ordinal) + val numBytes = ParquetTypesConverter.BYTES_FOR_PRECISION(precision) + val unscaledLong = decimal.toUnscaledLong + + var i = 0 + var shift = 8 * (numBytes - 1) + + while (i < numBytes) { + decimalBuffer(i) = (unscaledLong >> shift).toByte + i += 1 + shift -= 8 + } + + recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) + } + + case StructType(fields) => + (row: InternalRow, ordinal: Int) => + consumeGroup(writeFields(row.getStruct(ordinal, fields.length))) + + case arrayType: ArrayType if followParquetFormatSpec => + makeStandardArrayConsumer(arrayType.elementType) + + case arrayType: ArrayType if !followParquetFormatSpec => + makeLegacyArrayConsumer(arrayType.elementType, arrayType.containsNull) + + case mapType: MapType if followParquetFormatSpec => + makeMapConsumer(mapType.keyType, mapType.valueType, "key_value") + + case mapType: MapType if !followParquetFormatSpec => + makeMapConsumer(mapType.keyType, mapType.valueType, "map") + + case _ => + sys.error(s"Unsupported data type $dataType.") + } + } + + private def makeStandardArrayConsumer(elementType: DataType): ValueConsumer = { + makeThreeLevelArrayConsumer(elementType, "list", "element") + } + + private def makeLegacyArrayConsumer( + elementType: DataType, + containsNull: Boolean): ValueConsumer = { + if (containsNull) { + makeThreeLevelArrayConsumer(elementType, "bag", "array") + } else { + makeTwoLevelArrayConsumer(elementType, "array") + } + } + + private def makeThreeLevelArrayConsumer( + elementType: DataType, + repeatedGroupName: String, + elementFieldName: String): ValueConsumer = { + val elementConsumer = makeConsumer(elementType) + val mutableRow = new SpecificMutableRow(elementType :: Nil) + + (row: InternalRow, ordinal: Int) => { + consumeGroup { + consumeField(repeatedGroupName, 0) { + val array = row.get(ordinal).asInstanceOf[Array[_]] + var i = 0 + + while (i < array.length) { + consumeGroup { + if (array(i) != null) { + mutableRow.update(0, array(i)) + consumeField(elementFieldName, 0)(elementConsumer.apply(mutableRow, 0)) + } + } + + i += 1 + } + } + } + } + } + + private def makeTwoLevelArrayConsumer( + elementType: DataType, + repeatedFieldName: String): ValueConsumer = { + val elementConsumer = makeConsumer(elementType) + val mutableRow = new SpecificMutableRow(elementType :: Nil) + + (row: InternalRow, ordinal: Int) => { + consumeGroup { + consumeField(repeatedFieldName, 0) { + val array = row.get(ordinal).asInstanceOf[Array[_]] + var i = 0 + + while (i < array.length) { + 
mutableRow.update(0, array(i)) + elementConsumer.apply(mutableRow, 0) + i += 1 + } + } + } + } + } + + private def makeMapConsumer( + keyType: DataType, + valueType: DataType, + repeatedGroupName: String): ValueConsumer = { + val keyConsumer = makeConsumer(keyType) + val valueConsumer = makeConsumer(valueType) + val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) + + (row: InternalRow, ordinal: Int) => { + consumeGroup { + consumeField(repeatedGroupName, 0) { + val map = row.get(ordinal).asInstanceOf[Map[_, _]] + for ((key, value) <- map) { + consumeGroup { + mutableRow.update(0, key) + consumeField("key", 0)(keyConsumer.apply(mutableRow, 0)) + if (value != null) { + mutableRow.update(1, value) + consumeField("value", 1)(valueConsumer.apply(mutableRow, 1)) + } + } + } + } + } + } + } + + private def consumeGroup(f: => Unit): Unit = { + recordConsumer.startGroup() + f + recordConsumer.endGroup() + } + + private def consumeField(field: String, index: Int)(f: => Unit): Unit = { + recordConsumer.startField(field, index) + f + recordConsumer.endField(field, index) + } +} + +private[parquet] object CatalystWriteSupport { + val SPARK_ROW_SCHEMA: String = "org.apache.spark.sql.parquet.row.attributes" + + def setSchema(schema: StructType, configuration: Configuration): Unit = { + schema.map(_.name).foreach(CatalystSchemaConverter.checkFieldName) + configuration.set(SPARK_ROW_SCHEMA, schema.json) + configuration.set( + ParquetOutputFormat.WRITER_VERSION, + ParquetProperties.WriterVersion.PARQUET_1_0.toString) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala index b4337a48dbd80..2d98792385664 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala @@ -37,16 +37,17 @@ import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetRecordReader, _ import org.apache.parquet.schema.MessageType import org.apache.parquet.{Log => ParquetLog} -import org.apache.spark.{Logging, Partition => SparkPartition, SparkException} import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.{SqlNewHadoopPartition, SqlNewHadoopRDD, RDD} import org.apache.spark.rdd.RDD._ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionSpec +import org.apache.spark.sql.execution.{SqlNewHadoopPartition, SqlNewHadoopRDD} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.{SerializableConfiguration, Utils} +import org.apache.spark.{Logging, Partition => SparkPartition, SparkException} private[sql] class DefaultSource extends HadoopFsRelationProvider { @@ -228,18 +229,13 @@ private[sql] class ParquetRelation( // bundled with `ParquetOutputFormat[Row]`. job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]]) - // TODO There's no need to use two kinds of WriteSupport - // We should unify them. `SpecificMutableRow` can process both atomic (primitive) types and - // complex types. 
- val writeSupportClass = - if (dataSchema.map(_.dataType).forall(ParquetTypesConverter.isPrimitiveType)) { - classOf[MutableRowWriteSupport] - } else { - classOf[RowWriteSupport] - } + ParquetOutputFormat.setWriteSupportClass(job, classOf[CatalystWriteSupport]) + CatalystWriteSupport.setSchema(dataSchema, conf) - ParquetOutputFormat.setWriteSupportClass(job, writeSupportClass) - RowWriteSupport.setSchema(dataSchema.toAttributes, conf) + // Sets flag for Parquet schema converter (converting Catalyst schema to Parquet schema) + conf.set( + SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, + sqlContext.conf.followParquetFormatSpec.toString) // Sets compression scheme conf.set( @@ -267,7 +263,6 @@ private[sql] class ParquetRelation( val parquetFilterPushDown = sqlContext.conf.parquetFilterPushDown val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp - val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec // Create the function to set variable Parquet confs at both driver and executor side. val initLocalJobFuncOpt = @@ -278,8 +273,7 @@ private[sql] class ParquetRelation( useMetadataCache, parquetFilterPushDown, assumeBinaryIsString, - assumeInt96IsTimestamp, - followParquetFormatSpec) _ + assumeInt96IsTimestamp) _ // Create the function to set input paths at the driver side. val setInputPaths = ParquetRelation.initializeDriverSideJobFunc(inputFiles) _ @@ -479,8 +473,7 @@ private[sql] object ParquetRelation extends Logging { useMetadataCache: Boolean, parquetFilterPushDown: Boolean, assumeBinaryIsString: Boolean, - assumeInt96IsTimestamp: Boolean, - followParquetFormatSpec: Boolean)(job: Job): Unit = { + assumeInt96IsTimestamp: Boolean)(job: Job): Unit = { val conf = job.getConfiguration conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[CatalystReadSupport].getName) @@ -501,16 +494,15 @@ private[sql] object ParquetRelation extends Logging { }) conf.set( - RowWriteSupport.SPARK_ROW_SCHEMA, + CatalystWriteSupport.SPARK_ROW_SCHEMA, CatalystSchemaConverter.checkFieldNames(dataSchema).json) // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata conf.setBoolean(SQLConf.PARQUET_CACHE_METADATA.key, useMetadataCache) - // Sets flags for Parquet schema conversion + // Sets flags for Parquet schema converter (converting Parquet schema to Catalyst schema) conf.setBoolean(SQLConf.PARQUET_BINARY_AS_STRING.key, assumeBinaryIsString) conf.setBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, assumeInt96IsTimestamp) - conf.setBoolean(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, followParquetFormatSpec) } /** This closure sets input paths at the driver side. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala deleted file mode 100644 index 9cd0250f9c510..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.parquet - -import java.math.BigInteger -import java.nio.{ByteBuffer, ByteOrder} -import java.util.{HashMap => JHashMap} - -import org.apache.hadoop.conf.Configuration -import org.apache.parquet.column.ParquetProperties -import org.apache.parquet.hadoop.ParquetOutputFormat -import org.apache.parquet.hadoop.api.WriteSupport -import org.apache.parquet.io.api._ - -import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String - -/** - * A `parquet.hadoop.api.WriteSupport` for Row objects. - */ -private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Logging { - - private[parquet] var writer: RecordConsumer = null - private[parquet] var attributes: Array[Attribute] = null - - override def init(configuration: Configuration): WriteSupport.WriteContext = { - val origAttributesStr: String = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA) - val metadata = new JHashMap[String, String]() - metadata.put(CatalystReadSupport.SPARK_METADATA_KEY, origAttributesStr) - - if (attributes == null) { - attributes = ParquetTypesConverter.convertFromString(origAttributesStr).toArray - } - - log.debug(s"write support initialized for requested schema $attributes") - ParquetRelation.enableLogForwarding() - new WriteSupport.WriteContext(ParquetTypesConverter.convertFromAttributes(attributes), metadata) - } - - override def prepareForWrite(recordConsumer: RecordConsumer): Unit = { - writer = recordConsumer - log.debug(s"preparing for write with schema $attributes") - } - - override def write(record: InternalRow): Unit = { - val attributesSize = attributes.size - if (attributesSize > record.numFields) { - throw new IndexOutOfBoundsException("Trying to write more fields than contained in row " + - s"($attributesSize > ${record.numFields})") - } - - var index = 0 - writer.startMessage() - while(index < attributesSize) { - // null values indicate optional fields but we do not check currently - if (!record.isNullAt(index)) { - writer.startField(attributes(index).name, index) - writeValue(attributes(index).dataType, record.get(index, attributes(index).dataType)) - writer.endField(attributes(index).name, index) - } - index = index + 1 - } - writer.endMessage() - } - - private[parquet] def writeValue(schema: DataType, value: Any): Unit = { - if (value != null) { - schema match { - case t: UserDefinedType[_] => writeValue(t.sqlType, value) - case t @ ArrayType(_, _) => writeArray( - t, - value.asInstanceOf[CatalystConverter.ArrayScalaType]) - case t @ MapType(_, _, _) => writeMap( - t, - value.asInstanceOf[CatalystConverter.MapScalaType]) - case t @ StructType(_) => writeStruct( - t, - value.asInstanceOf[CatalystConverter.StructScalaType]) - case _ => writePrimitive(schema.asInstanceOf[AtomicType], value) - } - } - } - - private[parquet] def writePrimitive(schema: DataType, value: Any): Unit = { - if (value != null) { - schema match { 
- case BooleanType => writer.addBoolean(value.asInstanceOf[Boolean]) - case ByteType => writer.addInteger(value.asInstanceOf[Byte]) - case ShortType => writer.addInteger(value.asInstanceOf[Short]) - case IntegerType | DateType => writer.addInteger(value.asInstanceOf[Int]) - case LongType => writer.addLong(value.asInstanceOf[Long]) - case TimestampType => writeTimestamp(value.asInstanceOf[Long]) - case FloatType => writer.addFloat(value.asInstanceOf[Float]) - case DoubleType => writer.addDouble(value.asInstanceOf[Double]) - case StringType => writer.addBinary( - Binary.fromByteArray(value.asInstanceOf[UTF8String].getBytes)) - case BinaryType => writer.addBinary( - Binary.fromByteArray(value.asInstanceOf[Array[Byte]])) - case DecimalType.Fixed(precision, _) => - writeDecimal(value.asInstanceOf[Decimal], precision) - case _ => sys.error(s"Do not know how to writer $schema to consumer") - } - } - } - - private[parquet] def writeStruct( - schema: StructType, - struct: CatalystConverter.StructScalaType): Unit = { - if (struct != null) { - val fields = schema.fields.toArray - writer.startGroup() - var i = 0 - while(i < fields.length) { - if (!struct.isNullAt(i)) { - writer.startField(fields(i).name, i) - writeValue(fields(i).dataType, struct.get(i, fields(i).dataType)) - writer.endField(fields(i).name, i) - } - i = i + 1 - } - writer.endGroup() - } - } - - private[parquet] def writeArray( - schema: ArrayType, - array: CatalystConverter.ArrayScalaType): Unit = { - val elementType = schema.elementType - writer.startGroup() - if (array.numElements() > 0) { - if (schema.containsNull) { - writer.startField(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME, 0) - var i = 0 - while (i < array.numElements()) { - writer.startGroup() - if (!array.isNullAt(i)) { - writer.startField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0) - writeValue(elementType, array.get(i, elementType)) - writer.endField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0) - } - writer.endGroup() - i = i + 1 - } - writer.endField(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME, 0) - } else { - writer.startField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0) - var i = 0 - while (i < array.numElements()) { - writeValue(elementType, array.get(i, elementType)) - i = i + 1 - } - writer.endField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0) - } - } - writer.endGroup() - } - - private[parquet] def writeMap( - schema: MapType, - map: CatalystConverter.MapScalaType): Unit = { - writer.startGroup() - val length = map.numElements() - if (length > 0) { - writer.startField(CatalystConverter.MAP_SCHEMA_NAME, 0) - map.foreach(schema.keyType, schema.valueType, (key, value) => { - writer.startGroup() - writer.startField(CatalystConverter.MAP_KEY_SCHEMA_NAME, 0) - writeValue(schema.keyType, key) - writer.endField(CatalystConverter.MAP_KEY_SCHEMA_NAME, 0) - if (value != null) { - writer.startField(CatalystConverter.MAP_VALUE_SCHEMA_NAME, 1) - writeValue(schema.valueType, value) - writer.endField(CatalystConverter.MAP_VALUE_SCHEMA_NAME, 1) - } - writer.endGroup() - }) - writer.endField(CatalystConverter.MAP_SCHEMA_NAME, 0) - } - writer.endGroup() - } - - // Scratch array used to write decimals as fixed-length byte array - private[this] var reusableDecimalBytes = new Array[Byte](16) - - private[parquet] def writeDecimal(decimal: Decimal, precision: Int): Unit = { - val numBytes = CatalystSchemaConverter.minBytesForPrecision(precision) - - def longToBinary(unscaled: Long): Binary = { - var i = 0 - var shift = 8 * (numBytes - 1) - while (i < 
numBytes) { - reusableDecimalBytes(i) = (unscaled >> shift).toByte - i += 1 - shift -= 8 - } - Binary.fromByteArray(reusableDecimalBytes, 0, numBytes) - } - - def bigIntegerToBinary(unscaled: BigInteger): Binary = { - unscaled.toByteArray match { - case bytes if bytes.length == numBytes => - Binary.fromByteArray(bytes) - - case bytes if bytes.length <= reusableDecimalBytes.length => - val signedByte = (if (bytes.head < 0) -1 else 0).toByte - java.util.Arrays.fill(reusableDecimalBytes, 0, numBytes - bytes.length, signedByte) - System.arraycopy(bytes, 0, reusableDecimalBytes, numBytes - bytes.length, bytes.length) - Binary.fromByteArray(reusableDecimalBytes, 0, numBytes) - - case bytes => - reusableDecimalBytes = new Array[Byte](bytes.length) - bigIntegerToBinary(unscaled) - } - } - - val binary = if (numBytes <= 8) { - longToBinary(decimal.toUnscaledLong) - } else { - bigIntegerToBinary(decimal.toJavaBigDecimal.unscaledValue()) - } - - writer.addBinary(binary) - } - - // array used to write Timestamp as Int96 (fixed-length binary) - private[this] val int96buf = new Array[Byte](12) - - private[parquet] def writeTimestamp(ts: Long): Unit = { - val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(ts) - val buf = ByteBuffer.wrap(int96buf) - buf.order(ByteOrder.LITTLE_ENDIAN) - buf.putLong(timeOfDayNanos) - buf.putInt(julianDay) - writer.addBinary(Binary.fromByteArray(int96buf)) - } -} - -// Optimized for non-nested rows -private[parquet] class MutableRowWriteSupport extends RowWriteSupport { - override def write(record: InternalRow): Unit = { - val attributesSize = attributes.size - if (attributesSize > record.numFields) { - throw new IndexOutOfBoundsException("Trying to write more fields than contained in row " + - s"($attributesSize > ${record.numFields})") - } - - var index = 0 - writer.startMessage() - while(index < attributesSize) { - // null values indicate optional fields but we do not check currently - if (!record.isNullAt(index) && !record.isNullAt(index)) { - writer.startField(attributes(index).name, index) - consumeType(attributes(index).dataType, record, index) - writer.endField(attributes(index).name, index) - } - index = index + 1 - } - writer.endMessage() - } - - private def consumeType( - ctype: DataType, - record: InternalRow, - index: Int): Unit = { - ctype match { - case BooleanType => writer.addBoolean(record.getBoolean(index)) - case ByteType => writer.addInteger(record.getByte(index)) - case ShortType => writer.addInteger(record.getShort(index)) - case IntegerType | DateType => writer.addInteger(record.getInt(index)) - case LongType => writer.addLong(record.getLong(index)) - case TimestampType => writeTimestamp(record.getLong(index)) - case FloatType => writer.addFloat(record.getFloat(index)) - case DoubleType => writer.addDouble(record.getDouble(index)) - case StringType => - writer.addBinary(Binary.fromByteArray(record.getUTF8String(index).getBytes)) - case BinaryType => - writer.addBinary(Binary.fromByteArray(record.getBinary(index))) - case DecimalType.Fixed(precision, scale) => - writeDecimal(record.getDecimal(index, precision, scale), precision) - case _ => sys.error(s"Unsupported datatype $ctype, cannot write to consumer") - } - } -} - -private[parquet] object RowWriteSupport { - val SPARK_ROW_SCHEMA: String = "org.apache.spark.sql.parquet.row.attributes" - - def getSchema(configuration: Configuration): Seq[Attribute] = { - val schemaString = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA) - if (schemaString == null) { - throw new 
RuntimeException("Missing schema!") - } - ParquetTypesConverter.convertFromString(schemaString) - } - - def setSchema(schema: Seq[Attribute], configuration: Configuration) { - val encoded = ParquetTypesConverter.convertToString(schema) - configuration.set(SPARK_ROW_SCHEMA, encoded) - configuration.set( - ParquetOutputFormat.WRITER_VERSION, - ParquetProperties.WriterVersion.PARQUET_1_0.toString) - } -} From b465661b09cd34581babddeccafb4ea887777d34 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 27 Jul 2015 01:36:09 +0800 Subject: [PATCH 02/21] Cleans up code only used by old RowWriteSupport --- .../sql/parquet/CatalystSchemaConverter.scala | 2 +- .../sql/parquet/CatalystWriteSupport.scala | 15 +- .../spark/sql/parquet/ParquetConverter.scala | 39 ----- .../spark/sql/parquet/ParquetTypes.scala | 159 ------------------ .../parquet/ParquetCompatibilityTest.scala | 1 + .../spark/sql/parquet/ParquetIOSuite.scala | 11 +- .../sql/parquet/ParquetSchemaSuite.scala | 4 +- .../spark/sql/parquet/ParquetTest.scala | 35 ++++ 8 files changed, 57 insertions(+), 209 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala index d43ca95b4eea0..41b3c9d73e0af 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala @@ -453,7 +453,7 @@ private[parquet] class CatalystSchemaConverter( .buildGroup(REPEATED) // "array_element" is the name chosen by parquet-hive (1.7.0 and prior version) .addField(convertField(StructField("array_element", elementType, nullable))) - .named(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME)) + .named("bag")) // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level // LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 8410ae26f8705..91768e5b59bcb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -151,10 +151,10 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi case DecimalType.Fixed(precision, _) if precision > 18 => sys.error(s"Unsupported data type $dataType. Decimal precision cannot be greater than 18.") - case DecimalType.Fixed(precision) => + case DecimalType.Fixed(precision, _) => (row: InternalRow, ordinal: Int) => { val decimal = row.getDecimal(ordinal) - val numBytes = ParquetTypesConverter.BYTES_FOR_PRECISION(precision) + val numBytes = CatalystWriteSupport.BYTES_FOR_PRECISION(precision) val unscaledLong = decimal.toUnscaledLong var i = 0 @@ -304,4 +304,15 @@ private[parquet] object CatalystWriteSupport { ParquetOutputFormat.WRITER_VERSION, ParquetProperties.WriterVersion.PARQUET_1_0.toString) } + + /** + * Compute the FIXED_LEN_BYTE_ARRAY length needed to represent a given DECIMAL precision. 
+ */ + private[parquet] val BYTES_FOR_PRECISION = Array.tabulate[Int](38) { precision => + var length = 1 + while (math.pow(2.0, 8 * length - 1) < math.pow(10.0, precision)) { + length += 1 + } + length + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala deleted file mode 100644 index 6ed3580af0729..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.parquet - -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.types.{MapData, ArrayData} - -// TODO Removes this while fixing SPARK-8848 -private[sql] object CatalystConverter { - // This is mostly Parquet convention (see, e.g., `ConversionPatterns`). - // Note that "array" for the array elements is chosen by ParquetAvro. - // Using a different value will result in Parquet silently dropping columns. - val ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME = "bag" - val ARRAY_ELEMENTS_SCHEMA_NAME = "array" - - val MAP_KEY_SCHEMA_NAME = "key" - val MAP_VALUE_SCHEMA_NAME = "value" - val MAP_SCHEMA_NAME = "map" - - // TODO: consider using Array[T] for arrays to avoid boxing of primitive types - type ArrayScalaType = ArrayData - type StructScalaType = InternalRow - type MapScalaType = MapData -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala deleted file mode 100644 index 3854f5bd39fb1..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.parquet - -import java.io.IOException - -import scala.collection.JavaConversions._ -import scala.util.Try - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.mapreduce.Job -import org.apache.parquet.format.converter.ParquetMetadataConverter -import org.apache.parquet.hadoop.metadata.{FileMetaData, ParquetMetadata} -import org.apache.parquet.hadoop.util.ContextUtil -import org.apache.parquet.hadoop.{Footer, ParquetFileReader, ParquetFileWriter} -import org.apache.parquet.schema.MessageType - -import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.types._ - - -private[parquet] object ParquetTypesConverter extends Logging { - def isPrimitiveType(ctype: DataType): Boolean = ctype match { - case _: NumericType | BooleanType | DateType | TimestampType | StringType | BinaryType => true - case _ => false - } - - /** - * Compute the FIXED_LEN_BYTE_ARRAY length needed to represent a given DECIMAL precision. - */ - private[parquet] val BYTES_FOR_PRECISION = Array.tabulate[Int](38) { precision => - var length = 1 - while (math.pow(2.0, 8 * length - 1) < math.pow(10.0, precision)) { - length += 1 - } - length - } - - def convertFromAttributes(attributes: Seq[Attribute]): MessageType = { - val converter = new CatalystSchemaConverter() - converter.convert(StructType.fromAttributes(attributes)) - } - - def convertFromString(string: String): Seq[Attribute] = { - Try(DataType.fromJson(string)).getOrElse(DataType.fromCaseClassString(string)) match { - case s: StructType => s.toAttributes - case other => sys.error(s"Can convert $string to row") - } - } - - def convertToString(schema: Seq[Attribute]): String = { - schema.map(_.name).foreach(CatalystSchemaConverter.checkFieldName) - StructType.fromAttributes(schema).json - } - - def writeMetaData(attributes: Seq[Attribute], origPath: Path, conf: Configuration): Unit = { - if (origPath == null) { - throw new IllegalArgumentException("Unable to write Parquet metadata: path is null") - } - val fs = origPath.getFileSystem(conf) - if (fs == null) { - throw new IllegalArgumentException( - s"Unable to write Parquet metadata: path $origPath is incorrectly formatted") - } - val path = origPath.makeQualified(fs) - if (fs.exists(path) && !fs.getFileStatus(path).isDir) { - throw new IllegalArgumentException(s"Expected to write to directory $path but found file") - } - val metadataPath = new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE) - if (fs.exists(metadataPath)) { - try { - fs.delete(metadataPath, true) - } catch { - case e: IOException => - throw new IOException(s"Unable to delete previous PARQUET_METADATA_FILE at $metadataPath") - } - } - val extraMetadata = new java.util.HashMap[String, String]() - extraMetadata.put( - CatalystReadSupport.SPARK_METADATA_KEY, - ParquetTypesConverter.convertToString(attributes)) - // TODO: add extra data, e.g., table name, date, etc.? - - val parquetSchema: MessageType = ParquetTypesConverter.convertFromAttributes(attributes) - val metaData: FileMetaData = new FileMetaData( - parquetSchema, - extraMetadata, - "Spark") - - ParquetRelation.enableLogForwarding() - ParquetFileWriter.writeMetadataFile( - conf, - path, - new Footer(path, new ParquetMetadata(metaData, Nil)) :: Nil) - } - - /** - * Try to read Parquet metadata at the given Path. We first see if there is a summary file - * in the parent directory. If so, this is used. 
Else we read the actual footer at the given - * location. - * @param origPath The path at which we expect one (or more) Parquet files. - * @param configuration The Hadoop configuration to use. - * @return The `ParquetMetadata` containing among other things the schema. - */ - def readMetaData(origPath: Path, configuration: Option[Configuration]): ParquetMetadata = { - if (origPath == null) { - throw new IllegalArgumentException("Unable to read Parquet metadata: path is null") - } - val job = new Job() - val conf = configuration.getOrElse(ContextUtil.getConfiguration(job)) - val fs: FileSystem = origPath.getFileSystem(conf) - if (fs == null) { - throw new IllegalArgumentException(s"Incorrectly formatted Parquet metadata path $origPath") - } - val path = origPath.makeQualified(fs) - - val children = - fs - .globStatus(path) - .flatMap { status => if (status.isDir) fs.listStatus(status.getPath) else List(status) } - .filterNot { status => - val name = status.getPath.getName - (name(0) == '.' || name(0) == '_') && name != ParquetFileWriter.PARQUET_METADATA_FILE - } - - ParquetRelation.enableLogForwarding() - - // NOTE (lian): Parquet "_metadata" file can be very slow if the file consists of lots of row - // groups. Since Parquet schema is replicated among all row groups, we only need to touch a - // single row group to read schema related metadata. Notice that we are making assumptions that - // all data in a single Parquet file have the same schema, which is normally true. - children - // Try any non-"_metadata" file first... - .find(_.getPath.getName != ParquetFileWriter.PARQUET_METADATA_FILE) - // ... and fallback to "_metadata" if no such file exists (which implies the Parquet file is - // empty, thus normally the "_metadata" file is expected to be fairly small). 
- .orElse(children.find(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE)) - .map(ParquetFileReader.readFooter(conf, _, ParquetMetadataConverter.NO_FILTER)) - .getOrElse( - throw new IllegalArgumentException(s"Could not find Parquet metadata at path $path")) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala index b4cdfd9e98f6f..0238f0f0a9a1a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetCompatibilityTest.scala @@ -16,6 +16,7 @@ */ package org.apache.spark.sql.parquet + import java.io.File import scala.collection.JavaConversions._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index b415da5b8c136..5a54cf75e6a3f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -204,8 +204,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest { test("compression codec") { def compressionCodecFor(path: String): String = { - val codecs = ParquetTypesConverter - .readMetaData(new Path(path), Some(configuration)) + val codecs = readMetadata(new Path(path), configuration) .getBlocks .flatMap(_.getColumns) .map(_.getCodec.name()) @@ -277,15 +276,15 @@ class ParquetIOSuite extends QueryTest with ParquetTest { withTempPath { file => val path = new Path(file.toURI.toString) val fs = FileSystem.getLocal(configuration) - val attributes = ScalaReflection.attributesFor[(Int, String)] - ParquetTypesConverter.writeMetaData(attributes, path, configuration) + val schema = StructType.fromAttributes(ScalaReflection.attributesFor[(Int, String)]) + writeMetadata(schema, path, configuration) assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))) assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE))) - val metaData = ParquetTypesConverter.readMetaData(path, Some(configuration)) + val metaData = readMetadata(path, configuration) val actualSchema = metaData.getFileMetaData.getSchema - val expectedSchema = ParquetTypesConverter.convertFromAttributes(attributes) + val expectedSchema = new CatalystSchemaConverter(configuration).convert(schema) actualSchema.checkContains(expectedSchema) expectedSchema.checkContains(actualSchema) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index 4a0b3b60f419d..7deba1cf6ebfc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -361,8 +361,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { val jsonString = """{"type":"struct","fields":[{"name":"c1","type":"integer","nullable":false,"metadata":{}},{"name":"c2","type":"binary","nullable":true,"metadata":{}}]}""" // scalastyle:on - val fromCaseClassString = ParquetTypesConverter.convertFromString(caseClassString) - val fromJson = ParquetTypesConverter.convertFromString(jsonString) + val fromCaseClassString = StructType.fromString(caseClassString) + val fromJson = StructType.fromString(jsonString) (fromCaseClassString, fromJson).zipped.foreach { (a, b) => 
assert(a.name == b.name) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala index 64e94056f209a..c1c7ca9ae5821 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -19,11 +19,20 @@ package org.apache.spark.sql.parquet import java.io.File +import scala.collection.JavaConverters.{mapAsJavaMapConverter, seqAsJavaListConverter} import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.parquet.format.converter.ParquetMetadataConverter +import org.apache.parquet.hadoop.metadata.{BlockMetaData, FileMetaData, ParquetMetadata} +import org.apache.parquet.hadoop.{Footer, ParquetFileReader, ParquetFileWriter} + import org.apache.spark.SparkFunSuite +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SaveMode} /** @@ -97,4 +106,30 @@ private[sql] trait ParquetTest extends SQLTestUtils { this: SparkFunSuite => assert(partDir.mkdirs(), s"Couldn't create directory $partDir") partDir } + + def writeMetadata(schema: StructType, path: Path, configuration: Configuration): Unit = { + val parquetSchema = new CatalystSchemaConverter(configuration).convert(schema) + val extraMetadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> schema.json).asJava + val createdBy = s"Apache Spark ${org.apache.spark.SPARK_VERSION}" + val fileMetadata = new FileMetaData(parquetSchema, extraMetadata, createdBy) + val parquetMetadata = new ParquetMetadata(fileMetadata, Seq.empty[BlockMetaData].asJava) + val footer = new Footer(path, parquetMetadata) + ParquetFileWriter.writeMetadataFile(configuration, path, Seq(footer).asJava) + } + + + def readMetadata(path: Path, configuration: Configuration): ParquetMetadata = { + val summaryFileNames = Seq( + ParquetFileWriter.PARQUET_METADATA_FILE, + ParquetFileWriter.PARQUET_COMMON_METADATA_FILE) + + val fs = path.getFileSystem(configuration) + val leaves = SparkHadoopUtil.get.listLeafStatuses(fs, path).filter { f => + val name = f.getPath.getName + name.startsWith(".") && name.startsWith("_") || summaryFileNames.contains(name) + } + + ParquetFileReader.readFooter( + configuration, leaves.head, ParquetMetadataConverter.SKIP_ROW_GROUPS) + } } From 821e9ec25b755b4f46d715cbee465f2cb4432afb Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 27 Jul 2015 19:43:14 +0800 Subject: [PATCH 03/21] Fixes test failures --- .../sql/parquet/CatalystWriteSupport.scala | 24 +++++++++---------- .../DirectParquetOutputCommitter.scala | 2 +- .../spark/sql/parquet/ParquetIOSuite.scala | 23 +++++++++--------- .../spark/sql/parquet/ParquetTest.scala | 22 +++++++---------- 4 files changed, 33 insertions(+), 38 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 91768e5b59bcb..ca613eb47d871 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -78,13 +78,10 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } override def write(row: InternalRow): Unit = { - 
assert(row.numFields == schema.length) - recordConsumer.startMessage() - writeFields(row) - recordConsumer.endMessage() + consumeMessage(writeFields(row, schema)) } - private def writeFields(row: InternalRow): Unit = { + private def writeFields(row: InternalRow, schema: StructType): Unit = { val consumers = schema.map(_.dataType).map(makeConsumer) var i = 0 @@ -145,9 +142,6 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal))) - case DecimalType.Unlimited => - sys.error(s"Unsupported data type $dataType. Decimal precision must be specified.") - case DecimalType.Fixed(precision, _) if precision > 18 => sys.error(s"Unsupported data type $dataType. Decimal precision cannot be greater than 18.") @@ -169,9 +163,9 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) } - case StructType(fields) => + case structType @ StructType(fields) => (row: InternalRow, ordinal: Int) => - consumeGroup(writeFields(row.getStruct(ordinal, fields.length))) + consumeGroup(writeFields(row.getStruct(ordinal, fields.length), structType)) case arrayType: ArrayType if followParquetFormatSpec => makeStandardArrayConsumer(arrayType.elementType) @@ -214,7 +208,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup { consumeField(repeatedGroupName, 0) { - val array = row.get(ordinal).asInstanceOf[Array[_]] + val array = row.get(ordinal).asInstanceOf[Seq[_]] var i = 0 while (i < array.length) { @@ -241,7 +235,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup { consumeField(repeatedFieldName, 0) { - val array = row.get(ordinal).asInstanceOf[Array[_]] + val array = row.get(ordinal).asInstanceOf[Seq[_]] var i = 0 while (i < array.length) { @@ -281,6 +275,12 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } + private def consumeMessage(f: => Unit): Unit = { + recordConsumer.startMessage() + f + recordConsumer.endMessage() + } + private def consumeGroup(f: => Unit): Unit = { recordConsumer.startGroup() f diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala index 1551afd7b7bf2..46cfa9dc0a0bc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/DirectParquetOutputCommitter.scala @@ -39,7 +39,7 @@ import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetO * * NEVER use [[DirectParquetOutputCommitter]] when appending data, because currently there's * no safe way undo a failed appending job (that's why both `abortTask()` and `abortJob()` are - * left * empty). + * left empty). 
*/ private[parquet] class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext) extends ParquetOutputCommitter(outputPath, context) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index 5a54cf75e6a3f..2ea64c715b9a3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -29,7 +29,7 @@ import org.apache.parquet.example.data.{Group, GroupWriter} import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.WriteContext import org.apache.parquet.hadoop.metadata.{CompressionCodecName, FileMetaData, ParquetMetadata} -import org.apache.parquet.hadoop.{Footer, ParquetFileWriter, ParquetOutputCommitter, ParquetWriter} +import org.apache.parquet.hadoop._ import org.apache.parquet.io.api.RecordConsumer import org.apache.parquet.schema.{MessageType, MessageTypeParser} @@ -203,14 +203,14 @@ class ParquetIOSuite extends QueryTest with ParquetTest { } test("compression codec") { - def compressionCodecFor(path: String): String = { - val codecs = readMetadata(new Path(path), configuration) - .getBlocks - .flatMap(_.getColumns) - .map(_.getCodec.name()) - .distinct - - assert(codecs.size === 1) + def compressionCodecFor(path: String, codecName: String): String = { + val codecs = for { + footer <- readAllFootersWithoutSummaryFiles(new Path(path), configuration) + block <- footer.getParquetMetadata.getBlocks + column <- block.getColumns + } yield column.getCodec.name() + + assert(codecs.distinct === Seq(codecName)) codecs.head } @@ -220,7 +220,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest { withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> codec.name()) { withParquetFile(data) { path => assertResult(sqlContext.conf.parquetCompressionCodec.toUpperCase) { - compressionCodecFor(path) + compressionCodecFor(path, codec.name()) } } } @@ -282,9 +282,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest { assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))) assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE))) - val metaData = readMetadata(path, configuration) - val actualSchema = metaData.getFileMetaData.getSchema val expectedSchema = new CatalystSchemaConverter(configuration).convert(schema) + val actualSchema = readFooter(path, configuration).getFileMetaData.getSchema actualSchema.checkContains(expectedSchema) expectedSchema.checkContains(actualSchema) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala index c1c7ca9ae5821..f5dc9051a2cc2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.parquet import java.io.File -import scala.collection.JavaConverters.{mapAsJavaMapConverter, seqAsJavaListConverter} +import scala.collection.JavaConverters.{iterableAsScalaIterableConverter, mapAsJavaMapConverter, seqAsJavaListConverter} import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag @@ -30,7 +30,6 @@ import org.apache.parquet.hadoop.metadata.{BlockMetaData, FileMetaData, ParquetM import org.apache.parquet.hadoop.{Footer, ParquetFileReader, ParquetFileWriter} import 
org.apache.spark.SparkFunSuite -import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SaveMode} @@ -117,19 +116,16 @@ private[sql] trait ParquetTest extends SQLTestUtils { this: SparkFunSuite => ParquetFileWriter.writeMetadataFile(configuration, path, Seq(footer).asJava) } - - def readMetadata(path: Path, configuration: Configuration): ParquetMetadata = { - val summaryFileNames = Seq( - ParquetFileWriter.PARQUET_METADATA_FILE, - ParquetFileWriter.PARQUET_COMMON_METADATA_FILE) - + def readAllFootersWithoutSummaryFiles( + path: Path, configuration: Configuration): Seq[Footer] = { val fs = path.getFileSystem(configuration) - val leaves = SparkHadoopUtil.get.listLeafStatuses(fs, path).filter { f => - val name = f.getPath.getName - name.startsWith(".") && name.startsWith("_") || summaryFileNames.contains(name) - } + ParquetFileReader.readAllFootersInParallel(configuration, fs.getFileStatus(path)).asScala.toSeq + } + def readFooter(path: Path, configuration: Configuration): ParquetMetadata = { ParquetFileReader.readFooter( - configuration, leaves.head, ParquetMetadataConverter.SKIP_ROW_GROUPS) + configuration, + new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE), + ParquetMetadataConverter.NO_FILTER) } } From 2a1e884d003d1fce3b439f219500dc375cb2d1b7 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 27 Jul 2015 20:43:28 +0800 Subject: [PATCH 04/21] Fixes writing UDT --- .../sql/parquet/CatalystWriteSupport.scala | 59 ++++++++++--------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index ca613eb47d871..555b29e3d9289 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { - type ValueConsumer = (InternalRow, Int) => Unit + type ValueWriter = (InternalRow, Int) => Unit private var schema: StructType = _ @@ -82,13 +82,13 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } private def writeFields(row: InternalRow, schema: StructType): Unit = { - val consumers = schema.map(_.dataType).map(makeConsumer) + val writers = schema.map(_.dataType).map(makeWriter) var i = 0 while (i < row.numFields) { if (!row.isNullAt(i)) { consumeField(schema(i).name, i) { - consumers(i).apply(row, i) + writers(i).apply(row, i) } } @@ -96,7 +96,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } - private def makeConsumer(dataType: DataType): ValueConsumer = { + private def makeWriter(dataType: DataType): ValueWriter = { dataType match { case BooleanType => (row: InternalRow, ordinal: Int) => @@ -168,41 +168,44 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi consumeGroup(writeFields(row.getStruct(ordinal, fields.length), structType)) case arrayType: ArrayType if followParquetFormatSpec => - makeStandardArrayConsumer(arrayType.elementType) + makeStandardArrayWriter(arrayType.elementType) case arrayType: ArrayType if !followParquetFormatSpec => - makeLegacyArrayConsumer(arrayType.elementType, 
arrayType.containsNull) + makeLegacyArrayWriter(arrayType.elementType, arrayType.containsNull) case mapType: MapType if followParquetFormatSpec => - makeMapConsumer(mapType.keyType, mapType.valueType, "key_value") + makeMapWriter(mapType.keyType, mapType.valueType, "key_value") case mapType: MapType if !followParquetFormatSpec => - makeMapConsumer(mapType.keyType, mapType.valueType, "map") + makeMapWriter(mapType.keyType, mapType.valueType, "map") + + case udt: UserDefinedType[_] => + makeWriter(udt.sqlType) case _ => sys.error(s"Unsupported data type $dataType.") } } - private def makeStandardArrayConsumer(elementType: DataType): ValueConsumer = { - makeThreeLevelArrayConsumer(elementType, "list", "element") + private def makeStandardArrayWriter(elementType: DataType): ValueWriter = { + makeThreeLevelArrayWriter(elementType, "list", "element") } - private def makeLegacyArrayConsumer( + private def makeLegacyArrayWriter( elementType: DataType, - containsNull: Boolean): ValueConsumer = { + containsNull: Boolean): ValueWriter = { if (containsNull) { - makeThreeLevelArrayConsumer(elementType, "bag", "array") + makeThreeLevelArrayWriter(elementType, "bag", "array") } else { - makeTwoLevelArrayConsumer(elementType, "array") + makeTwoLevelArrayWriter(elementType, "array") } } - private def makeThreeLevelArrayConsumer( + private def makeThreeLevelArrayWriter( elementType: DataType, repeatedGroupName: String, - elementFieldName: String): ValueConsumer = { - val elementConsumer = makeConsumer(elementType) + elementFieldName: String): ValueWriter = { + val elementWriter = makeWriter(elementType) val mutableRow = new SpecificMutableRow(elementType :: Nil) (row: InternalRow, ordinal: Int) => { @@ -215,7 +218,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi consumeGroup { if (array(i) != null) { mutableRow.update(0, array(i)) - consumeField(elementFieldName, 0)(elementConsumer.apply(mutableRow, 0)) + consumeField(elementFieldName, 0)(elementWriter.apply(mutableRow, 0)) } } @@ -226,10 +229,10 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } - private def makeTwoLevelArrayConsumer( + private def makeTwoLevelArrayWriter( elementType: DataType, - repeatedFieldName: String): ValueConsumer = { - val elementConsumer = makeConsumer(elementType) + repeatedFieldName: String): ValueWriter = { + val elementWriter = makeWriter(elementType) val mutableRow = new SpecificMutableRow(elementType :: Nil) (row: InternalRow, ordinal: Int) => { @@ -240,7 +243,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi while (i < array.length) { mutableRow.update(0, array(i)) - elementConsumer.apply(mutableRow, 0) + elementWriter.apply(mutableRow, 0) i += 1 } } @@ -248,12 +251,12 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } - private def makeMapConsumer( + private def makeMapWriter( keyType: DataType, valueType: DataType, - repeatedGroupName: String): ValueConsumer = { - val keyConsumer = makeConsumer(keyType) - val valueConsumer = makeConsumer(valueType) + repeatedGroupName: String): ValueWriter = { + val keyWriter = makeWriter(keyType) + val valueWriter = makeWriter(valueType) val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) (row: InternalRow, ordinal: Int) => { @@ -263,10 +266,10 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi for ((key, value) <- map) { consumeGroup { mutableRow.update(0, key) - consumeField("key", 
0)(keyConsumer.apply(mutableRow, 0)) + consumeField("key", 0)(keyWriter.apply(mutableRow, 0)) if (value != null) { mutableRow.update(1, value) - consumeField("value", 1)(valueConsumer.apply(mutableRow, 1)) + consumeField("value", 1)(valueWriter.apply(mutableRow, 1)) } } } From e9638f01f7c09d5664ac6b36c26d1831c870f710 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 27 Jul 2015 21:17:07 +0800 Subject: [PATCH 05/21] Optimizes writing structs --- .../sql/parquet/CatalystWriteSupport.scala | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 555b29e3d9289..4218e6f308456 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -40,6 +40,8 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi private var schema: StructType = _ + private var rootFieldWriters: Seq[ValueWriter] = _ + private var recordConsumer: RecordConsumer = _ private var followParquetFormatSpec: Boolean = _ @@ -53,6 +55,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi override def init(configuration: Configuration): WriteContext = { val schemaString = configuration.get(CatalystWriteSupport.SPARK_ROW_SCHEMA) schema = StructType.fromString(schemaString) + rootFieldWriters = schema.map(_.dataType).map(makeWriter) assert(configuration.get(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key) != null) followParquetFormatSpec = @@ -78,17 +81,17 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } override def write(row: InternalRow): Unit = { - consumeMessage(writeFields(row, schema)) + consumeMessage(writeFields(row, schema, rootFieldWriters)) } - private def writeFields(row: InternalRow, schema: StructType): Unit = { - val writers = schema.map(_.dataType).map(makeWriter) + private def writeFields( + row: InternalRow, schema: StructType, fieldWriters: Seq[ValueWriter]): Unit = { var i = 0 while (i < row.numFields) { if (!row.isNullAt(i)) { consumeField(schema(i).name, i) { - writers(i).apply(row, i) + fieldWriters(i).apply(row, i) } } @@ -163,9 +166,13 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) } - case structType @ StructType(fields) => + case structType: StructType => + val fieldWriters = structType.map(_.dataType).map(makeWriter) (row: InternalRow, ordinal: Int) => - consumeGroup(writeFields(row.getStruct(ordinal, fields.length), structType)) + consumeGroup { + val struct = row.getStruct(ordinal, structType.length) + writeFields(struct, structType, fieldWriters) + } case arrayType: ArrayType if followParquetFormatSpec => makeStandardArrayWriter(arrayType.elementType) From a2aeba5dd834b7232d91c227f67d49375b185b56 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 27 Jul 2015 22:01:30 +0800 Subject: [PATCH 06/21] Fixes writing empty arrays and maps --- .../sql/parquet/CatalystWriteSupport.scala | 59 ++++++++++--------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 4218e6f308456..54096b3515100 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -217,19 +217,19 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup { - consumeField(repeatedGroupName, 0) { - val array = row.get(ordinal).asInstanceOf[Seq[_]] - var i = 0 - - while (i < array.length) { - consumeGroup { - if (array(i) != null) { - mutableRow.update(0, array(i)) - consumeField(elementFieldName, 0)(elementWriter.apply(mutableRow, 0)) + val array = row.get(ordinal).asInstanceOf[Seq[_]] + if (array.nonEmpty) { + consumeField(repeatedGroupName, 0) { + var i = 0 + while (i < array.length) { + consumeGroup { + if (array(i) != null) { + mutableRow.update(0, array(i)) + consumeField(elementFieldName, 0)(elementWriter.apply(mutableRow, 0)) + } } + i += 1 } - - i += 1 } } } @@ -244,14 +244,15 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup { - consumeField(repeatedFieldName, 0) { - val array = row.get(ordinal).asInstanceOf[Seq[_]] - var i = 0 - - while (i < array.length) { - mutableRow.update(0, array(i)) - elementWriter.apply(mutableRow, 0) - i += 1 + val array = row.get(ordinal).asInstanceOf[Seq[_]] + if (array.nonEmpty) { + consumeField(repeatedFieldName, 0) { + var i = 0 + while (i < array.length) { + mutableRow.update(0, array(i)) + elementWriter.apply(mutableRow, 0) + i += 1 + } } } } @@ -268,15 +269,17 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup { - consumeField(repeatedGroupName, 0) { - val map = row.get(ordinal).asInstanceOf[Map[_, _]] - for ((key, value) <- map) { - consumeGroup { - mutableRow.update(0, key) - consumeField("key", 0)(keyWriter.apply(mutableRow, 0)) - if (value != null) { - mutableRow.update(1, value) - consumeField("value", 1)(valueWriter.apply(mutableRow, 1)) + val map = row.get(ordinal).asInstanceOf[Map[_, _]] + if (map.nonEmpty) { + consumeField(repeatedGroupName, 0) { + for ((key, value) <- map) { + consumeGroup { + mutableRow.update(0, key) + consumeField("key", 0)(keyWriter.apply(mutableRow, 0)) + if (value != null) { + mutableRow.update(1, value) + consumeField("value", 1)(valueWriter.apply(mutableRow, 1)) + } } } } From 678ccd4ec69ad5a082bec566a7270242d69f9fa5 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 28 Jul 2015 21:28:28 +0800 Subject: [PATCH 07/21] Migrates large decimal precision support --- .../org/apache/spark/sql/types/Decimal.scala | 4 +- .../sql/parquet/CatalystRowConverter.scala | 12 ++- .../sql/parquet/CatalystSchemaConverter.scala | 43 ++++----- .../sql/parquet/CatalystWriteSupport.scala | 92 ++++++++++++++----- 4 files changed, 101 insertions(+), 50 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index c0155eeb450a6..6754c4713830c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -106,7 +106,9 @@ final class Decimal extends Ordered[Decimal] with Serializable { */ def set(decimal: BigDecimal, precision: Int, scale: Int): Decimal = { this.decimalVal = decimal.setScale(scale, ROUNDING_MODE) - require(decimalVal.precision <= precision, "Overflowed precision") + require( + 
decimalVal.precision <= precision, + s"Precision overflow. Max precision: $precision, got: ${decimalVal.precision}") this.longVal = 0L this._precision = precision this._scale = scale diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala index 6938b071065cd..f294e12a8f005 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala @@ -264,7 +264,7 @@ private[parquet] class CatalystRowConverter( val scale = decimalType.scale val bytes = value.getBytes - if (precision <= 8) { + def bytesToUnscaledLong(bytes: Array[Byte]): Long = { // Constructs a `Decimal` with an unscaled `Long` value if possible. var unscaled = 0L var i = 0 @@ -275,11 +275,17 @@ private[parquet] class CatalystRowConverter( } val bits = 8 * bytes.length - unscaled = (unscaled << (64 - bits)) >> (64 - bits) + (unscaled << (64 - bits)) >> (64 - bits) + } + + if (precision <= CatalystSchemaConverter.MAX_PRECISION_FOR_INT64) { + // Constructs a `Decimal` with an unscaled `Long` value if possible. + val unscaled = bytesToUnscaledLong(bytes) Decimal(unscaled, precision, scale) } else { // Otherwise, resorts to an unscaled `BigInteger` instead. - Decimal(new BigDecimal(new BigInteger(bytes), scale), precision, scale) + val unscaled = new BigInteger(bytes) + Decimal(new BigDecimal(unscaled, scale), precision, scale) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala index 41b3c9d73e0af..ceb7a5479c9bb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala @@ -25,6 +25,7 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.parquet.schema.Type.Repetition._ import org.apache.parquet.schema._ +import org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, maxPrecisionForBytes, minBytesForPrecision} import org.apache.spark.sql.types._ import org.apache.spark.sql.{AnalysisException, SQLConf} @@ -155,7 +156,7 @@ private[parquet] class CatalystSchemaConverter( case INT_16 => ShortType case INT_32 | null => IntegerType case DATE => DateType - case DECIMAL => makeDecimalType(maxPrecisionForBytes(4)) + case DECIMAL => makeDecimalType(MAX_PRECISION_FOR_INT32) case TIME_MILLIS => typeNotImplemented() case _ => illegalType() } @@ -163,7 +164,7 @@ private[parquet] class CatalystSchemaConverter( case INT64 => originalType match { case INT_64 | null => LongType - case DECIMAL => makeDecimalType(maxPrecisionForBytes(8)) + case DECIMAL => makeDecimalType(MAX_PRECISION_FOR_INT64) case TIMESTAMP_MILLIS => typeNotImplemented() case _ => illegalType() } @@ -396,7 +397,7 @@ private[parquet] class CatalystSchemaConverter( .as(DECIMAL) .precision(precision) .scale(scale) - .length(CatalystSchemaConverter.minBytesForPrecision(precision)) + .length(minBytesForPrecision(precision)) .named(field.name) // ===================================== @@ -405,7 +406,7 @@ private[parquet] class CatalystSchemaConverter( // Uses INT32 for 1 <= precision <= 9 case DecimalType.Fixed(precision, scale) - if precision <= maxPrecisionForBytes(4) && followParquetFormatSpec => + if precision <= 
MAX_PRECISION_FOR_INT32 && followParquetFormatSpec => Types .primitive(INT32, repetition) .as(DECIMAL) @@ -413,9 +414,9 @@ private[parquet] class CatalystSchemaConverter( .scale(scale) .named(field.name) - // Uses INT64 for 1 <= precision <= 18 + // Uses INT64 for 10 <= precision <= 18 case DecimalType.Fixed(precision, scale) - if precision <= maxPrecisionForBytes(8) && followParquetFormatSpec => + if precision <= MAX_PRECISION_FOR_INT64 && followParquetFormatSpec => Types .primitive(INT64, repetition) .as(DECIMAL) @@ -430,7 +431,7 @@ private[parquet] class CatalystSchemaConverter( .as(DECIMAL) .precision(precision) .scale(scale) - .length(CatalystSchemaConverter.minBytesForPrecision(precision)) + .length(minBytesForPrecision(precision)) .named(field.name) // =================================================== @@ -534,14 +535,6 @@ private[parquet] class CatalystSchemaConverter( throw new AnalysisException(s"Unsupported data type $field.dataType") } } - - // Max precision of a decimal value stored in `numBytes` bytes - private def maxPrecisionForBytes(numBytes: Int): Int = { - Math.round( // convert double to long - Math.floor(Math.log10( // number of base-10 digits - Math.pow(2, 8 * numBytes - 1) - 1))) // max value stored in numBytes - .asInstanceOf[Int] - } } @@ -566,7 +559,8 @@ private[parquet] object CatalystSchemaConverter { } } - private def computeMinBytesForPrecision(precision : Int) : Int = { + // The minimum number of bytes needed to store a decimal with a given `precision`. + val minBytesForPrecision = Array.tabulate[Int](DecimalType.MAX_PRECISION + 1) { precision => var numBytes = 1 while (math.pow(2.0, 8 * numBytes - 1) < math.pow(10.0, precision)) { numBytes += 1 @@ -574,14 +568,15 @@ private[parquet] object CatalystSchemaConverter { numBytes } - private val MIN_BYTES_FOR_PRECISION = Array.tabulate[Int](39)(computeMinBytesForPrecision) + val MAX_PRECISION_FOR_INT32 = maxPrecisionForBytes(4) - // Returns the minimum number of bytes needed to store a decimal with a given `precision`. 
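// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the arithmetic behind
// maxPrecisionForBytes and minBytesForPrecision that this hunk moves into the
// companion object. Standalone Scala with invented names; the simplified
// helpers below are assumptions for the example, not the committed code.
object DecimalWidthExample extends App {
  // Max number of base-10 digits that fit in a signed value of `numBytes` bytes
  def maxPrecisionForBytes(numBytes: Int): Int =
    Math.floor(Math.log10(Math.pow(2, 8 * numBytes - 1) - 1)).toInt

  // Smallest byte width whose signed range covers 10^precision unscaled values
  def minBytesForPrecision(precision: Int): Int = {
    var numBytes = 1
    while (math.pow(2.0, 8 * numBytes - 1) < math.pow(10.0, precision)) numBytes += 1
    numBytes
  }

  assert(maxPrecisionForBytes(4) == 9)   // why MAX_PRECISION_FOR_INT32 is 9
  assert(maxPrecisionForBytes(8) == 18)  // why MAX_PRECISION_FOR_INT64 is 18
  assert(minBytesForPrecision(9) == 4)
  assert(minBytesForPrecision(18) == 8)
  assert(minBytesForPrecision(38) == 16) // precision 38 needs a 16-byte FIXED_LEN_BYTE_ARRAY
}
// ---------------------------------------------------------------------------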
- def minBytesForPrecision(precision : Int) : Int = { - if (precision < MIN_BYTES_FOR_PRECISION.length) { - MIN_BYTES_FOR_PRECISION(precision) - } else { - computeMinBytesForPrecision(precision) - } + val MAX_PRECISION_FOR_INT64 = maxPrecisionForBytes(8) + + // Max precision of a decimal value stored in `numBytes` bytes + def maxPrecisionForBytes(numBytes: Int): Int = { + Math.round( // convert double to long + Math.floor(Math.log10( // number of base-10 digits + Math.pow(2, 8 * numBytes - 1) - 1))) // max value stored in numBytes + .asInstanceOf[Int] } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 54096b3515100..54b610dca770b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.parquet import java.nio.{ByteBuffer, ByteOrder} +import java.util import scala.collection.JavaConverters.mapAsJavaMapConverter @@ -33,24 +34,30 @@ import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, minBytesForPrecision} import org.apache.spark.sql.types._ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { + // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer type ValueWriter = (InternalRow, Int) => Unit + // Schema of the `InternalRow`s to be written private var schema: StructType = _ + // `ValueWriter`s for all fields of the schema private var rootFieldWriters: Seq[ValueWriter] = _ + // The Parquet `RecordConsumer` to which all `InternalRow`s are written private var recordConsumer: RecordConsumer = _ + // Whether we should write standard Parquet data conforming to parquet-format spec or not private var followParquetFormatSpec: Boolean = _ - // Byte array used to write timestamps as Parquet INT96 values + // Reusable byte array used to write timestamps as Parquet INT96 values private val timestampBuffer = new Array[Byte](12) - // Byte array used to write decimal values - private val decimalBuffer = new Array[Byte](8) + // Reusable byte array used to write decimal values + private val decimalBuffer = new Array[Byte](minBytesForPrecision(DecimalType.MAX_PRECISION)) override def init(configuration: Configuration): WriteContext = { val schemaString = configuration.get(CatalystWriteSupport.SPARK_ROW_SCHEMA) @@ -145,26 +152,8 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal))) - case DecimalType.Fixed(precision, _) if precision > 18 => - sys.error(s"Unsupported data type $dataType. 
Decimal precision cannot be greater than 18.") - case DecimalType.Fixed(precision, _) => - (row: InternalRow, ordinal: Int) => { - val decimal = row.getDecimal(ordinal) - val numBytes = CatalystWriteSupport.BYTES_FOR_PRECISION(precision) - val unscaledLong = decimal.toUnscaledLong - - var i = 0 - var shift = 8 * (numBytes - 1) - - while (i < numBytes) { - decimalBuffer(i) = (unscaledLong >> shift).toByte - i += 1 - shift -= 8 - } - - recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) - } + makeDecimalWriter(precision) case structType: StructType => val fieldWriters = structType.map(_.dataType).map(makeWriter) @@ -194,6 +183,65 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } + private def makeDecimalWriter(precision: Int): ValueWriter = { + assert( + precision <= DecimalType.MAX_PRECISION, + s"Precision overflow: $precision is greater than ${DecimalType.MAX_PRECISION}") + + val numBytes = minBytesForPrecision(precision) + + val int32Writer = + (row: InternalRow, ordinal: Int) => + recordConsumer.addInteger(row.getDecimal(ordinal).toUnscaledLong.toInt) + + val int64Writer = + (row: InternalRow, ordinal: Int) => + recordConsumer.addLong(row.getDecimal(ordinal).toUnscaledLong) + + val binaryWriterUsingUnscaledLong = + (row: InternalRow, ordinal: Int) => { + // This writer converts underlying unscaled Long value to raw bytes using a reusable byte + // array to minimize array allocation. + + val unscaled = row.getDecimal(ordinal).toUnscaledLong + var i = 0 + var shift = 8 * (numBytes - 1) + + while (i < numBytes) { + decimalBuffer(i) = (unscaled >> shift).toByte + i += 1 + shift -= 8 + } + + recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) + } + + val binaryWriterUsingUnscaledBytes = + (row: InternalRow, ordinal: Int) => { + val decimal = row.getDecimal(ordinal) + val bytes = decimal.toJavaBigDecimal.unscaledValue().toByteArray + util.Arrays.fill(decimalBuffer, 0: Byte) + System.arraycopy(bytes, 0, decimalBuffer, numBytes - bytes.length, bytes.length) + recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) + } + + followParquetFormatSpec match { + // Standard mode, writes decimals with precision <= 9 as INT32 + case true if precision <= MAX_PRECISION_FOR_INT32 => int32Writer + + // Standard mode, writes decimals with precision <= 18 as INT64 + case true if precision <= MAX_PRECISION_FOR_INT64 => int64Writer + + // Legacy mode, writes decimals with precision <= 18 as BINARY + case false if precision <= MAX_PRECISION_FOR_INT64 => binaryWriterUsingUnscaledLong + + // All other cases: + // - Standard mode, writes decimals with precision > 18 as BINARY + // - Legacy mode, writes decimals with all precision as BINARY + case _ => binaryWriterUsingUnscaledBytes + } + } + private def makeStandardArrayWriter(elementType: DataType): ValueWriter = { makeThreeLevelArrayWriter(elementType, "list", "element") } From b9f93dbdaf7c710e4ef48c828f5b305a66e549d9 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 28 Jul 2015 23:38:02 +0800 Subject: [PATCH 08/21] Fixes writing negative decimal values --- .../spark/sql/parquet/CatalystWriteSupport.scala | 13 ++++++++++--- .../apache/spark/sql/parquet/ParquetIOSuite.scala | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 54b610dca770b..7df1192be9c57 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -220,9 +220,16 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { val decimal = row.getDecimal(ordinal) val bytes = decimal.toJavaBigDecimal.unscaledValue().toByteArray - util.Arrays.fill(decimalBuffer, 0: Byte) - System.arraycopy(bytes, 0, decimalBuffer, numBytes - bytes.length, bytes.length) - recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) + val binary = if (bytes.length == numBytes) { + bytes + } else { + val signByte = if (bytes.head < 0) -1: Byte else 0: Byte + util.Arrays.fill(decimalBuffer, 0, numBytes - bytes.length, signByte) + System.arraycopy(bytes, 0, decimalBuffer, numBytes - bytes.length, bytes.length) + decimalBuffer + } + + recordConsumer.addBinary(Binary.fromByteArray(binary, 0, numBytes)) } followParquetFormatSpec match { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index 2ea64c715b9a3..24ab233d22cb9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -101,7 +101,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest { def makeDecimalRDD(decimal: DecimalType): DataFrame = sqlContext.sparkContext .parallelize(0 to 1000) - .map(i => Tuple1(i / 100.0)) + .map(i => Tuple1((i - 500) / 100.0)) .toDF() // Parquet doesn't allow column names with spaces, have to add an alias here .select($"_1" cast decimal as "dec") From 0e0a957d25b94b8121867d24730490b7b4bde2be Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 29 Jul 2015 01:37:38 +0800 Subject: [PATCH 09/21] Minor comment updates --- .../apache/spark/sql/parquet/CatalystWriteSupport.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 7df1192be9c57..ec87d5bc88bdc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -141,7 +141,9 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi recordConsumer.addBinary(Binary.fromByteArray(row.getUTF8String(ordinal).getBytes)) case TimestampType => + // TODO Writes `TimestampType` values as `TIMESTAMP_MICROS` once parquet-mr implements it (row: InternalRow, ordinal: Int) => { + // Actually Spark SQL `TimestampType` only has microsecond precision. 
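// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the sign padding added in
// "Fixes writing negative decimal values" above. BigInteger.toByteArray
// returns the minimal two's-complement encoding, so a negative unscaled value
// must be left-padded with 0xFF rather than 0x00 to keep its value when
// widened to the fixed length. Standalone Scala; names are invented here.
object DecimalSignPaddingExample extends App {
  def toFixedLength(unscaled: java.math.BigInteger, numBytes: Int): Array[Byte] = {
    val bytes = unscaled.toByteArray
    if (bytes.length == numBytes) bytes
    else {
      val padded = new Array[Byte](numBytes)
      val signByte: Byte = if (bytes.head < 0) -1 else 0
      java.util.Arrays.fill(padded, 0, numBytes - bytes.length, signByte)
      System.arraycopy(bytes, 0, padded, numBytes - bytes.length, bytes.length)
      padded
    }
  }

  // A decimal of 3.00 at scale 2 has unscaled value 300; -3.00 has -300.
  val pos = toFixedLength(java.math.BigInteger.valueOf(300), 4)
  val neg = toFixedLength(java.math.BigInteger.valueOf(-300), 4)
  assert(pos.toSeq == Seq[Byte](0, 0, 1, 44))     // 0x0000012C ==  300
  assert(neg.toSeq == Seq[Byte](-1, -1, -2, -44)) // 0xFFFFFED4 == -300 (not 0x0000FED4)
}
// ---------------------------------------------------------------------------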
val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(row.getLong(ordinal)) val buf = ByteBuffer.wrap(timestampBuffer) buf.order(ByteOrder.LITTLE_ENDIAN).putLong(timeOfDayNanos).putInt(julianDay) @@ -239,12 +241,12 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi // Standard mode, writes decimals with precision <= 18 as INT64 case true if precision <= MAX_PRECISION_FOR_INT64 => int64Writer - // Legacy mode, writes decimals with precision <= 18 as BINARY + // Legacy mode, writes decimals with precision <= 18 as FIXED_LEN_BYTE_ARRAY case false if precision <= MAX_PRECISION_FOR_INT64 => binaryWriterUsingUnscaledLong // All other cases: - // - Standard mode, writes decimals with precision > 18 as BINARY - // - Legacy mode, writes decimals with all precision as BINARY + // - Standard mode, writes decimals with precision > 18 as FIXED_LEN_BYTE_ARRAY + // - Legacy mode, writes decimals with all precision as FIXED_LEN_BYTE_ARRAY case _ => binaryWriterUsingUnscaledBytes } } From 2859132e33a1d8d4e86016feca1d0876d162b380 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 29 Jul 2015 17:35:36 +0800 Subject: [PATCH 10/21] Fixes array type conversion in legacy mode --- .../sql/parquet/CatalystSchemaConverter.scala | 14 ++++++++------ .../spark/sql/parquet/ParquetSchemaSuite.scala | 6 +++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala index ceb7a5479c9bb..acfa1b1ab6443 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala @@ -439,12 +439,14 @@ private[parquet] class CatalystSchemaConverter( // =================================================== // Spark 1.4.x and prior versions convert ArrayType with nullable elements into a 3-level - // LIST structure. This behavior mimics parquet-hive (1.6.0rc3). Note that this case is - // covered by the backwards-compatibility rules implemented in `isElementType()`. + // LIST structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro + // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level anonymous + // field name "array" is from parquet-avro. Note that this case is covered by the backwards- + // compatibility rules implemented in `isElementType()`. case ArrayType(elementType, nullable @ true) if !followParquetFormatSpec => // group (LIST) { // optional group bag { - // repeated element; + // repeated array; // } // } ConversionPatterns.listType( @@ -452,8 +454,8 @@ private[parquet] class CatalystSchemaConverter( field.name, Types .buildGroup(REPEATED) - // "array_element" is the name chosen by parquet-hive (1.7.0 and prior version) - .addField(convertField(StructField("array_element", elementType, nullable))) + // "array" is the name chosen by Spark SQL 1.4.0 and prior versions + .addField(convertField(StructField("array", elementType, nullable))) .named("bag")) // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level @@ -461,7 +463,7 @@ private[parquet] class CatalystSchemaConverter( // covered by the backwards-compatibility rules implemented in `isElementType()`. 
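// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the two LIST layouts discussed
// in the hunk above, spelled out as Parquet message types. The legacy form
// matches the updated ParquetSchemaSuite expectations (element field renamed
// to "array"); the field name "f" and the int32 element are invented for the
// example.
object ListLayoutExample extends App {
  import org.apache.parquet.schema.MessageTypeParser

  // Legacy Spark 1.4.x layout for ArrayType(IntegerType, containsNull = true)
  val legacyNullableList = MessageTypeParser.parseMessageType(
    """message root {
      |  optional group f (LIST) {
      |    repeated group bag {
      |      optional int32 array;
      |    }
      |  }
      |}
    """.stripMargin)

  // Standard parquet-format layout emitted when followParquetFormatSpec is on
  val standardList = MessageTypeParser.parseMessageType(
    """message root {
      |  optional group f (LIST) {
      |    repeated group list {
      |      optional int32 element;
      |    }
      |  }
      |}
    """.stripMargin)

  println(legacyNullableList)
  println(standardList)
}
// ---------------------------------------------------------------------------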
case ArrayType(elementType, nullable @ false) if !followParquetFormatSpec => // group (LIST) { - // repeated element; + // repeated array; // } ConversionPatterns.listType( repetition, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index 7deba1cf6ebfc..ec6e37cd5ee85 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -198,7 +198,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional group _1 (LIST) { | repeated group bag { - | optional int32 array_element; + | optional int32 array; | } | } |} @@ -267,7 +267,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | optional binary _1 (UTF8); | optional group _2 (LIST) { | repeated group bag { - | optional group array_element { + | optional group array { | required int32 _1; | required double _2; | } @@ -616,7 +616,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional group f1 (LIST) { | repeated group bag { - | optional int32 array_element; + | optional int32 array; | } | } |} From 62c482930a61427e0eed7257080abcf23fee25b0 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 29 Jul 2015 18:12:35 +0800 Subject: [PATCH 11/21] Minor refactoring --- .../sql/parquet/CatalystWriteSupport.scala | 88 ++++++++----------- 1 file changed, 36 insertions(+), 52 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index ec87d5bc88bdc..3ffb7cb0dd180 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -157,28 +157,28 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi case DecimalType.Fixed(precision, _) => makeDecimalWriter(precision) - case structType: StructType => - val fieldWriters = structType.map(_.dataType).map(makeWriter) + case t: StructType => + val fieldWriters = t.map(_.dataType).map(makeWriter) (row: InternalRow, ordinal: Int) => - consumeGroup { - val struct = row.getStruct(ordinal, structType.length) - writeFields(struct, structType, fieldWriters) - } + consumeGroup(writeFields(row.getStruct(ordinal, t.length), t, fieldWriters)) + + case ArrayType(elementType, _) if followParquetFormatSpec => + makeThreeLevelArrayWriter(elementType, "list", "element") - case arrayType: ArrayType if followParquetFormatSpec => - makeStandardArrayWriter(arrayType.elementType) + case ArrayType(elementType, true) if !followParquetFormatSpec => + makeThreeLevelArrayWriter(elementType, "bag", "array") - case arrayType: ArrayType if !followParquetFormatSpec => - makeLegacyArrayWriter(arrayType.elementType, arrayType.containsNull) + case ArrayType(elementType, false) if !followParquetFormatSpec => + makeTwoLevelArrayWriter(elementType, "array") - case mapType: MapType if followParquetFormatSpec => - makeMapWriter(mapType.keyType, mapType.valueType, "key_value") + case t: MapType if followParquetFormatSpec => + makeMapWriter(t, "key_value") - case mapType: MapType if !followParquetFormatSpec => - makeMapWriter(mapType.keyType, mapType.valueType, "map") + case t: MapType if !followParquetFormatSpec => + makeMapWriter(t, "map") - case udt: 
UserDefinedType[_] => - makeWriter(udt.sqlType) + case t: UserDefinedType[_] => + makeWriter(t.sqlType) case _ => sys.error(s"Unsupported data type $dataType.") @@ -202,9 +202,9 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val binaryWriterUsingUnscaledLong = (row: InternalRow, ordinal: Int) => { - // This writer converts underlying unscaled Long value to raw bytes using a reusable byte - // array to minimize array allocation. - + // When the precision is low enough (<= 18) to squeeze the decimal value into a `Long`, we + // can build a fixed-length byte array with length `numBytes` using the unscaled `Long` + // value and the `decimalBuffer` for better performance. val unscaled = row.getDecimal(ordinal).toUnscaledLong var i = 0 var shift = 8 * (numBytes - 1) @@ -220,18 +220,22 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val binaryWriterUsingUnscaledBytes = (row: InternalRow, ordinal: Int) => { - val decimal = row.getDecimal(ordinal) - val bytes = decimal.toJavaBigDecimal.unscaledValue().toByteArray - val binary = if (bytes.length == numBytes) { + val bytes = row.getDecimal(ordinal).toJavaBigDecimal.unscaledValue().toByteArray + val fixedLengthBytes = if (bytes.length == numBytes) { + // If the length of the underlying byte array of the unscaled `BigInteger` happens to be + // `numBytes`, just reuse it, so that we don't bother copying it to `decimalBuffer`. bytes } else { + // Otherwise, the length must be less than `numBytes`. In this case we copy contents of + // the underlying bytes with enough sign bytes to `decimalBuffer` to form the result + // fixed-length byte array. val signByte = if (bytes.head < 0) -1: Byte else 0: Byte util.Arrays.fill(decimalBuffer, 0, numBytes - bytes.length, signByte) System.arraycopy(bytes, 0, decimalBuffer, numBytes - bytes.length, bytes.length) decimalBuffer } - recordConsumer.addBinary(Binary.fromByteArray(binary, 0, numBytes)) + recordConsumer.addBinary(Binary.fromByteArray(fixedLengthBytes, 0, numBytes)) } followParquetFormatSpec match { @@ -251,30 +255,14 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } - private def makeStandardArrayWriter(elementType: DataType): ValueWriter = { - makeThreeLevelArrayWriter(elementType, "list", "element") - } - - private def makeLegacyArrayWriter( - elementType: DataType, - containsNull: Boolean): ValueWriter = { - if (containsNull) { - makeThreeLevelArrayWriter(elementType, "bag", "array") - } else { - makeTwoLevelArrayWriter(elementType, "array") - } - } - private def makeThreeLevelArrayWriter( - elementType: DataType, - repeatedGroupName: String, - elementFieldName: String): ValueWriter = { + elementType: DataType, repeatedGroupName: String, elementFieldName: String): ValueWriter = { val elementWriter = makeWriter(elementType) val mutableRow = new SpecificMutableRow(elementType :: Nil) (row: InternalRow, ordinal: Int) => { consumeGroup { - val array = row.get(ordinal).asInstanceOf[Seq[_]] + val array = row.genericGet(ordinal).asInstanceOf[Seq[_]] if (array.nonEmpty) { consumeField(repeatedGroupName, 0) { var i = 0 @@ -294,14 +282,13 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } private def makeTwoLevelArrayWriter( - elementType: DataType, - repeatedFieldName: String): ValueWriter = { + elementType: DataType, repeatedFieldName: String): ValueWriter = { val elementWriter = makeWriter(elementType) val mutableRow = new SpecificMutableRow(elementType :: Nil) (row: 
InternalRow, ordinal: Int) => { consumeGroup { - val array = row.get(ordinal).asInstanceOf[Seq[_]] + val array = row.genericGet(ordinal).asInstanceOf[Seq[_]] if (array.nonEmpty) { consumeField(repeatedFieldName, 0) { var i = 0 @@ -316,17 +303,14 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } - private def makeMapWriter( - keyType: DataType, - valueType: DataType, - repeatedGroupName: String): ValueWriter = { - val keyWriter = makeWriter(keyType) - val valueWriter = makeWriter(valueType) - val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) + private def makeMapWriter(mapType: MapType, repeatedGroupName: String): ValueWriter = { + val keyWriter = makeWriter(mapType.keyType) + val valueWriter = makeWriter(mapType.valueType) + val mutableRow = new SpecificMutableRow(mapType.keyType :: mapType.valueType :: Nil) (row: InternalRow, ordinal: Int) => { consumeGroup { - val map = row.get(ordinal).asInstanceOf[Map[_, _]] + val map = row.get(ordinal, mapType).asInstanceOf[Map[_, _]] if (map.nonEmpty) { consumeField(repeatedGroupName, 0) { for ((key, value) <- map) { From b37fe7724fa6780ae4c5952d71a4e6b2e10fd959 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Fri, 31 Jul 2015 18:26:25 +0800 Subject: [PATCH 12/21] Fixes compilation error introduced while rebasing --- .../sql/parquet/CatalystWriteSupport.scala | 19 ++++++++++--------- .../spark/sql/parquet/ParquetRelation.scala | 3 +-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 3ffb7cb0dd180..db1d418fc1714 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -154,8 +154,8 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal))) - case DecimalType.Fixed(precision, _) => - makeDecimalWriter(precision) + case DecimalType.Fixed(precision, scale) => + makeDecimalWriter(precision, scale) case t: StructType => val fieldWriters = t.map(_.dataType).map(makeWriter) @@ -185,7 +185,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } - private def makeDecimalWriter(precision: Int): ValueWriter = { + private def makeDecimalWriter(precision: Int, scale: Int): ValueWriter = { assert( precision <= DecimalType.MAX_PRECISION, s"Precision overflow: $precision is greater than ${DecimalType.MAX_PRECISION}") @@ -194,18 +194,18 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val int32Writer = (row: InternalRow, ordinal: Int) => - recordConsumer.addInteger(row.getDecimal(ordinal).toUnscaledLong.toInt) + recordConsumer.addInteger(row.getDecimal(ordinal, precision, scale).toUnscaledLong.toInt) val int64Writer = (row: InternalRow, ordinal: Int) => - recordConsumer.addLong(row.getDecimal(ordinal).toUnscaledLong) + recordConsumer.addLong(row.getDecimal(ordinal, precision, scale).toUnscaledLong) val binaryWriterUsingUnscaledLong = (row: InternalRow, ordinal: Int) => { // When the precision is low enough (<= 18) to squeeze the decimal value into a `Long`, we // can build a fixed-length byte array with length `numBytes` using the unscaled `Long` // value and the `decimalBuffer` for better performance. 
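// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the shift loop described in the
// comment above, extracted as a standalone function. It emits the unscaled
// Long as big-endian bytes truncated to `numBytes`. The function and sample
// values are invented for the example (precision 5, scale 2, so 3 bytes).
object UnscaledLongToBytesExample extends App {
  def unscaledToBytes(unscaled: Long, numBytes: Int): Array[Byte] = {
    val out = new Array[Byte](numBytes)
    var i = 0
    var shift = 8 * (numBytes - 1)
    while (i < numBytes) {
      out(i) = (unscaled >> shift).toByte // most significant remaining byte first
      i += 1
      shift -= 8
    }
    out
  }

  // 123.45 at scale 2 has unscaled value 12345 = 0x3039
  assert(unscaledToBytes(12345L, 3).toSeq == Seq[Byte](0x00, 0x30, 0x39))
  // Negative values also round-trip because `>>` is an arithmetic (sign-preserving) shift
  assert(unscaledToBytes(-12345L, 3).toSeq == Seq[Byte](-1, 0xCF.toByte, 0xC7.toByte))
}
// ---------------------------------------------------------------------------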
- val unscaled = row.getDecimal(ordinal).toUnscaledLong + val unscaled = row.getDecimal(ordinal, precision, scale).toUnscaledLong var i = 0 var shift = 8 * (numBytes - 1) @@ -220,14 +220,15 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val binaryWriterUsingUnscaledBytes = (row: InternalRow, ordinal: Int) => { - val bytes = row.getDecimal(ordinal).toJavaBigDecimal.unscaledValue().toByteArray + val decimal = row.getDecimal(ordinal, precision, scale) + val bytes = decimal.toJavaBigDecimal.unscaledValue().toByteArray val fixedLengthBytes = if (bytes.length == numBytes) { // If the length of the underlying byte array of the unscaled `BigInteger` happens to be // `numBytes`, just reuse it, so that we don't bother copying it to `decimalBuffer`. bytes } else { // Otherwise, the length must be less than `numBytes`. In this case we copy contents of - // the underlying bytes with enough sign bytes to `decimalBuffer` to form the result + // the underlying bytes with padding sign bytes to `decimalBuffer` to form the result // fixed-length byte array. val signByte = if (bytes.head < 0) -1: Byte else 0: Byte util.Arrays.fill(decimalBuffer, 0, numBytes - bytes.length, signByte) @@ -250,7 +251,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi // All other cases: // - Standard mode, writes decimals with precision > 18 as FIXED_LEN_BYTE_ARRAY - // - Legacy mode, writes decimals with all precision as FIXED_LEN_BYTE_ARRAY + // - Legacy mode, writes decimals with precision > 18 as FIXED_LEN_BYTE_ARRAY case _ => binaryWriterUsingUnscaledBytes } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala index 2d98792385664..b7202375dc128 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala @@ -38,12 +38,11 @@ import org.apache.parquet.schema.MessageType import org.apache.parquet.{Log => ParquetLog} import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.{SqlNewHadoopPartition, SqlNewHadoopRDD, RDD} import org.apache.spark.rdd.RDD._ +import org.apache.spark.rdd.{RDD, SqlNewHadoopPartition, SqlNewHadoopRDD} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.PartitionSpec -import org.apache.spark.sql.execution.{SqlNewHadoopPartition, SqlNewHadoopRDD} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.{SerializableConfiguration, Utils} From 0d61b3bda95bd825f9a667dcfe8d0f5df8e6c594 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Fri, 31 Jul 2015 19:04:47 +0800 Subject: [PATCH 13/21] Writes arrays using ArrayData --- .../sql/parquet/CatalystWriteSupport.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index db1d418fc1714..0606d03a1b9f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -263,14 +263,14 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup 
{ - val array = row.genericGet(ordinal).asInstanceOf[Seq[_]] - if (array.nonEmpty) { + val array = row.getArray(ordinal) + if (array.numElements() > 0) { consumeField(repeatedGroupName, 0) { var i = 0 - while (i < array.length) { + while (i < array.numElements()) { consumeGroup { - if (array(i) != null) { - mutableRow.update(0, array(i)) + if (!array.isNullAt(i)) { + mutableRow.update(0, array.get(i)) consumeField(elementFieldName, 0)(elementWriter.apply(mutableRow, 0)) } } @@ -289,12 +289,12 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi (row: InternalRow, ordinal: Int) => { consumeGroup { - val array = row.genericGet(ordinal).asInstanceOf[Seq[_]] - if (array.nonEmpty) { + val array = row.getArray(ordinal) + if (array.numElements() > 0) { consumeField(repeatedFieldName, 0) { var i = 0 - while (i < array.length) { - mutableRow.update(0, array(i)) + while (i < array.numElements()) { + mutableRow.update(0, array.get(i)) elementWriter.apply(mutableRow, 0) i += 1 } From f901a16d10ce7ac7c6be0e54a3c15669e9189ad4 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sat, 1 Aug 2015 22:54:12 +0800 Subject: [PATCH 14/21] Writes maps using MapData --- .../sql/parquet/CatalystWriteSupport.scala | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 0606d03a1b9f8..0b28543744d93 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -39,7 +39,7 @@ import org.apache.spark.sql.types._ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer - type ValueWriter = (InternalRow, Int) => Unit + private type ValueWriter = (InternalRow, Int) => Unit // Schema of the `InternalRow`s to be written private var schema: StructType = _ @@ -270,7 +270,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi while (i < array.numElements()) { consumeGroup { if (!array.isNullAt(i)) { - mutableRow.update(0, array.get(i)) + mutableRow.update(0, array.get(i, elementType)) consumeField(elementFieldName, 0)(elementWriter.apply(mutableRow, 0)) } } @@ -294,7 +294,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi consumeField(repeatedFieldName, 0) { var i = 0 while (i < array.numElements()) { - mutableRow.update(0, array.get(i)) + mutableRow.update(0, array.get(i, elementType)) elementWriter.apply(mutableRow, 0) i += 1 } @@ -305,24 +305,28 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } private def makeMapWriter(mapType: MapType, repeatedGroupName: String): ValueWriter = { - val keyWriter = makeWriter(mapType.keyType) - val valueWriter = makeWriter(mapType.valueType) - val mutableRow = new SpecificMutableRow(mapType.keyType :: mapType.valueType :: Nil) + val keyType = mapType.keyType + val valueType = mapType.valueType + val keyWriter = makeWriter(keyType) + val valueWriter = makeWriter(valueType) + val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) (row: InternalRow, ordinal: Int) => { consumeGroup { - val map = row.get(ordinal, mapType).asInstanceOf[Map[_, _]] - if (map.nonEmpty) { + val map = row.getMap(ordinal) + if 
(map.numElements() > 0) { consumeField(repeatedGroupName, 0) { - for ((key, value) <- map) { + var i = 0 + while (i < map.numElements()) { consumeGroup { - mutableRow.update(0, key) + mutableRow.update(0, map.keyArray().get(i, keyType)) consumeField("key", 0)(keyWriter.apply(mutableRow, 0)) - if (value != null) { - mutableRow.update(1, value) + if (!map.valueArray().isNullAt(i)) { + mutableRow.update(1, map.valueArray().get(i, valueType)) consumeField("value", 1)(valueWriter.apply(mutableRow, 1)) } } + i += 1 } } } From 5127b8d0fd8584e45b447a6e76ef58e572961a7e Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 2 Aug 2015 00:04:24 +0800 Subject: [PATCH 15/21] Retrieves data from SpecializedGetters --- .../sql/parquet/CatalystWriteSupport.scala | 54 +++++++++---------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala index 0b28543744d93..553d05e8c6b76 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala @@ -32,14 +32,16 @@ import org.apache.parquet.io.api.{Binary, RecordConsumer} import org.apache.spark.Logging import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow +import org.apache.spark.sql.catalyst.expressions.{SpecializedGetters, SpecificMutableRow} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, minBytesForPrecision} import org.apache.spark.sql.types._ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { - // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer - private type ValueWriter = (InternalRow, Int) => Unit + // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer. + // Here we are using `SpecializedGetters` rather than `InternalRow` so that we can directly access + // data in `ArrayData` without the help of `SpecificMutableRow`. 
+ private type ValueWriter = (SpecializedGetters, Int) => Unit // Schema of the `InternalRow`s to be written private var schema: StructType = _ @@ -94,14 +96,12 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi private def writeFields( row: InternalRow, schema: StructType, fieldWriters: Seq[ValueWriter]): Unit = { var i = 0 - while (i < row.numFields) { if (!row.isNullAt(i)) { consumeField(schema(i).name, i) { fieldWriters(i).apply(row, i) } } - i += 1 } } @@ -109,40 +109,40 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi private def makeWriter(dataType: DataType): ValueWriter = { dataType match { case BooleanType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addBoolean(row.getBoolean(ordinal)) case ByteType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addInteger(row.getByte(ordinal)) case ShortType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addInteger(row.getShort(ordinal)) case IntegerType | DateType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addInteger(row.getInt(ordinal)) case LongType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addLong(row.getLong(ordinal)) case FloatType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addFloat(row.getFloat(ordinal)) case DoubleType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addDouble(row.getDouble(ordinal)) case StringType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addBinary(Binary.fromByteArray(row.getUTF8String(ordinal).getBytes)) case TimestampType => // TODO Writes `TimestampType` values as `TIMESTAMP_MICROS` once parquet-mr implements it - (row: InternalRow, ordinal: Int) => { + (row: SpecializedGetters, ordinal: Int) => { // Actually Spark SQL `TimestampType` only has microsecond precision. 
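// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the 12-byte INT96 layout the
// timestamp writer produces, i.e. 8 bytes of little-endian nanoseconds-of-day
// followed by 4 bytes of little-endian Julian day number. Standalone Scala;
// the object and sample values below are invented for the example (2440588 is
// the Julian day number of the Unix epoch, 1970-01-01).
object Int96TimestampLayoutExample extends App {
  import java.nio.{ByteBuffer, ByteOrder}

  def toInt96(julianDay: Int, timeOfDayNanos: Long): Array[Byte] = {
    val buf = new Array[Byte](12)
    ByteBuffer.wrap(buf)
      .order(ByteOrder.LITTLE_ENDIAN)
      .putLong(timeOfDayNanos) // bytes 0-7: nanoseconds within the day
      .putInt(julianDay)       // bytes 8-11: Julian day number
    buf
  }

  // Round-trip an example (julianDay, nanosOfDay) pair through the raw bytes
  val bytes = toInt96(julianDay = 2440588, timeOfDayNanos = 3600L * 1000L * 1000L * 1000L)
  val rb = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
  assert(rb.getLong == 3600L * 1000L * 1000L * 1000L)
  assert(rb.getInt == 2440588)
}
// ---------------------------------------------------------------------------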
val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(row.getLong(ordinal)) val buf = ByteBuffer.wrap(timestampBuffer) @@ -151,7 +151,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } case BinaryType => - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal))) case DecimalType.Fixed(precision, scale) => @@ -159,7 +159,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi case t: StructType => val fieldWriters = t.map(_.dataType).map(makeWriter) - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => consumeGroup(writeFields(row.getStruct(ordinal, t.length), t, fieldWriters)) case ArrayType(elementType, _) if followParquetFormatSpec => @@ -193,15 +193,15 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val numBytes = minBytesForPrecision(precision) val int32Writer = - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addInteger(row.getDecimal(ordinal, precision, scale).toUnscaledLong.toInt) val int64Writer = - (row: InternalRow, ordinal: Int) => + (row: SpecializedGetters, ordinal: Int) => recordConsumer.addLong(row.getDecimal(ordinal, precision, scale).toUnscaledLong) val binaryWriterUsingUnscaledLong = - (row: InternalRow, ordinal: Int) => { + (row: SpecializedGetters, ordinal: Int) => { // When the precision is low enough (<= 18) to squeeze the decimal value into a `Long`, we // can build a fixed-length byte array with length `numBytes` using the unscaled `Long` // value and the `decimalBuffer` for better performance. @@ -219,7 +219,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } val binaryWriterUsingUnscaledBytes = - (row: InternalRow, ordinal: Int) => { + (row: SpecializedGetters, ordinal: Int) => { val decimal = row.getDecimal(ordinal, precision, scale) val bytes = decimal.toJavaBigDecimal.unscaledValue().toByteArray val fixedLengthBytes = if (bytes.length == numBytes) { @@ -259,9 +259,8 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi private def makeThreeLevelArrayWriter( elementType: DataType, repeatedGroupName: String, elementFieldName: String): ValueWriter = { val elementWriter = makeWriter(elementType) - val mutableRow = new SpecificMutableRow(elementType :: Nil) - (row: InternalRow, ordinal: Int) => { + (row: SpecializedGetters, ordinal: Int) => { consumeGroup { val array = row.getArray(ordinal) if (array.numElements() > 0) { @@ -270,8 +269,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi while (i < array.numElements()) { consumeGroup { if (!array.isNullAt(i)) { - mutableRow.update(0, array.get(i, elementType)) - consumeField(elementFieldName, 0)(elementWriter.apply(mutableRow, 0)) + consumeField(elementFieldName, 0)(elementWriter.apply(array, i)) } } i += 1 @@ -285,17 +283,15 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi private def makeTwoLevelArrayWriter( elementType: DataType, repeatedFieldName: String): ValueWriter = { val elementWriter = makeWriter(elementType) - val mutableRow = new SpecificMutableRow(elementType :: Nil) - (row: InternalRow, ordinal: Int) => { + (row: SpecializedGetters, ordinal: Int) => { consumeGroup { val array = row.getArray(ordinal) if (array.numElements() > 0) { consumeField(repeatedFieldName, 0) { var i = 0 while (i < 
array.numElements()) { - mutableRow.update(0, array.get(i, elementType)) - elementWriter.apply(mutableRow, 0) + elementWriter.apply(array, i) i += 1 } } @@ -311,7 +307,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val valueWriter = makeWriter(valueType) val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) - (row: InternalRow, ordinal: Int) => { + (row: SpecializedGetters, ordinal: Int) => { consumeGroup { val map = row.getMap(ordinal) if (map.numElements() > 0) { From 1f1d4af761760ac99de8fdccae30a13851304d26 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 2 Aug 2015 15:49:22 +0800 Subject: [PATCH 16/21] Renames "Catalyst*" classes to "Parquet*" --- ...Support.scala => ParquetReadSupport.scala} | 22 ++++++------- ....scala => ParquetRecordMaterializer.scala} | 4 +-- .../spark/sql/parquet/ParquetRelation.scala | 24 +++++++------- ...verter.scala => ParquetRowConverter.scala} | 32 +++++++++---------- ...ter.scala => ParquetSchemaConverter.scala} | 22 ++++++------- ...upport.scala => ParquetWriteSupport.scala} | 25 +++++++++------ .../spark/sql/parquet/ParquetIOSuite.scala | 4 +-- .../sql/parquet/ParquetSchemaSuite.scala | 4 +-- .../spark/sql/parquet/ParquetTest.scala | 4 +-- 9 files changed, 73 insertions(+), 68 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/parquet/{CatalystReadSupport.scala => ParquetReadSupport.scala} (88%) rename sql/core/src/main/scala/org/apache/spark/sql/parquet/{CatalystRecordMaterializer.scala => ParquetRecordMaterializer.scala} (90%) rename sql/core/src/main/scala/org/apache/spark/sql/parquet/{CatalystRowConverter.scala => ParquetRowConverter.scala} (93%) rename sql/core/src/main/scala/org/apache/spark/sql/parquet/{CatalystSchemaConverter.scala => ParquetSchemaConverter.scala} (97%) rename sql/core/src/main/scala/org/apache/spark/sql/parquet/{CatalystWriteSupport.scala => ParquetWriteSupport.scala} (92%) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala similarity index 88% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala rename to sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala index 9648035744c1d..b67d15264c415 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala @@ -31,7 +31,7 @@ import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType -private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with Logging { +private[parquet] class ParquetReadSupport extends ReadSupport[InternalRow] with Logging { override def prepareForRead( conf: Configuration, keyValueMetaData: JMap[String, String], @@ -39,24 +39,24 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with readContext: ReadContext): RecordMaterializer[InternalRow] = { log.debug(s"Preparing for read Parquet file with message type: $fileSchema") - val toCatalyst = new CatalystSchemaConverter(conf) + val toCatalyst = new ParquetSchemaConverter(conf) val parquetRequestedSchema = readContext.getRequestedSchema val catalystRequestedSchema = Option(readContext.getReadSupportMetadata).map(_.toMap).flatMap { metadata => metadata // First tries to read requested schema, which may result from projections - 
.get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA) + .get(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA) // If not available, tries to read Catalyst schema from file metadata. It's only // available if the target file is written by Spark SQL. - .orElse(metadata.get(CatalystReadSupport.SPARK_METADATA_KEY)) + .orElse(metadata.get(ParquetReadSupport.SPARK_METADATA_KEY)) }.map(StructType.fromString).getOrElse { logDebug("Catalyst schema not available, falling back to Parquet schema") toCatalyst.convert(parquetRequestedSchema) } logDebug(s"Catalyst schema used to read Parquet files: $catalystRequestedSchema") - new CatalystRecordMaterializer(parquetRequestedSchema, catalystRequestedSchema) + new ParquetRecordMaterializer(parquetRequestedSchema, catalystRequestedSchema) } override def init(context: InitContext): ReadContext = { @@ -64,11 +64,11 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with // If the target file was written by Spark SQL, we should be able to find a serialized Catalyst // schema of this file from its the metadata. - val maybeRowSchema = Option(conf.get(CatalystWriteSupport.SPARK_ROW_SCHEMA)) + val maybeRowSchema = Option(conf.get(ParquetWriteSupport.SPARK_ROW_SCHEMA)) // Optional schema of requested columns, in the form of a string serialized from a Catalyst // `StructType` containing all requested columns. - val maybeRequestedSchema = Option(conf.get(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA)) + val maybeRequestedSchema = Option(conf.get(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA)) // Below we construct a Parquet schema containing all requested columns. This schema tells // Parquet which columns to read. @@ -110,7 +110,7 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with // different physical structures. 
val parquetRequestedSchema = maybeRequestedSchema.fold(context.getFileSchema) { schemaString => - val toParquet = new CatalystSchemaConverter(conf) + val toParquet = new ParquetSchemaConverter(conf) val fileSchema = context.getFileSchema.asGroupType() val fileFieldNames = fileSchema.getFields.map(_.getName).toSet @@ -138,15 +138,15 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with val metadata = Map.empty[String, String] ++ - maybeRequestedSchema.map(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++ - maybeRowSchema.map(CatalystWriteSupport.SPARK_ROW_SCHEMA -> _) + maybeRequestedSchema.map(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA -> _) ++ + maybeRowSchema.map(ParquetWriteSupport.SPARK_ROW_SCHEMA -> _) logInfo(s"Going to read Parquet file with these requested columns: $parquetRequestedSchema") new ReadContext(parquetRequestedSchema, metadata) } } -private[parquet] object CatalystReadSupport { +private[parquet] object ParquetReadSupport { val SPARK_ROW_REQUESTED_SCHEMA = "org.apache.spark.sql.parquet.row.requested_schema" val SPARK_METADATA_KEY = "org.apache.spark.sql.parquet.row.metadata" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRecordMaterializer.scala similarity index 90% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala rename to sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRecordMaterializer.scala index 84f1dccfeb788..8c1971dbe4d65 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRecordMaterializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRecordMaterializer.scala @@ -29,11 +29,11 @@ import org.apache.spark.sql.types.StructType * @param parquetSchema Parquet schema of the records to be read * @param catalystSchema Catalyst schema of the rows to be constructed */ -private[parquet] class CatalystRecordMaterializer( +private[parquet] class ParquetRecordMaterializer( parquetSchema: MessageType, catalystSchema: StructType) extends RecordMaterializer[InternalRow] { - private val rootConverter = new CatalystRowConverter(parquetSchema, catalystSchema, NoopUpdater) + private val rootConverter = new ParquetRowConverter(parquetSchema, catalystSchema, NoopUpdater) override def getCurrentRecord: InternalRow = rootConverter.currentRow diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala index b7202375dc128..f2e0f78da4433 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala @@ -228,8 +228,8 @@ private[sql] class ParquetRelation( // bundled with `ParquetOutputFormat[Row]`. 
job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]]) - ParquetOutputFormat.setWriteSupportClass(job, classOf[CatalystWriteSupport]) - CatalystWriteSupport.setSchema(dataSchema, conf) + ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetWriteSupport]) + ParquetWriteSupport.setSchema(dataSchema, conf) // Sets flag for Parquet schema converter (converting Catalyst schema to Parquet schema) conf.set( @@ -474,7 +474,7 @@ private[sql] object ParquetRelation extends Logging { assumeBinaryIsString: Boolean, assumeInt96IsTimestamp: Boolean)(job: Job): Unit = { val conf = job.getConfiguration - conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[CatalystReadSupport].getName) + conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[ParquetReadSupport].getName) // Try to push down filters when filter push-down is enabled. if (parquetFilterPushDown) { @@ -487,14 +487,14 @@ private[sql] object ParquetRelation extends Logging { .foreach(ParquetInputFormat.setFilterPredicate(conf, _)) } - conf.set(CatalystReadSupport.SPARK_ROW_REQUESTED_SCHEMA, { + conf.set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA, { val requestedSchema = StructType(requiredColumns.map(dataSchema(_))) - CatalystSchemaConverter.checkFieldNames(requestedSchema).json + ParquetSchemaConverter.checkFieldNames(requestedSchema).json }) conf.set( - CatalystWriteSupport.SPARK_ROW_SCHEMA, - CatalystSchemaConverter.checkFieldNames(dataSchema).json) + ParquetWriteSupport.SPARK_ROW_SCHEMA, + ParquetSchemaConverter.checkFieldNames(dataSchema).json) // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata conf.setBoolean(SQLConf.PARQUET_CACHE_METADATA.key, useMetadataCache) @@ -518,7 +518,7 @@ private[sql] object ParquetRelation extends Logging { footers: Seq[Footer], sqlContext: SQLContext): Option[StructType] = { def parseParquetSchema(schema: MessageType): StructType = { - val converter = new CatalystSchemaConverter( + val converter = new ParquetSchemaConverter( sqlContext.conf.isParquetBinaryAsString, sqlContext.conf.isParquetBinaryAsString, sqlContext.conf.followParquetFormatSpec) @@ -532,7 +532,7 @@ private[sql] object ParquetRelation extends Logging { val serializedSchema = metadata .getKeyValueMetaData .toMap - .get(CatalystReadSupport.SPARK_METADATA_KEY) + .get(ParquetReadSupport.SPARK_METADATA_KEY) if (serializedSchema.isEmpty) { // Falls back to Parquet schema if no Spark SQL schema found. Some(parseParquetSchema(metadata.getSchema)) @@ -692,7 +692,7 @@ private[sql] object ParquetRelation extends Logging { // Converter used to convert Parquet `MessageType` to Spark SQL `StructType` val converter = - new CatalystSchemaConverter( + new ParquetSchemaConverter( assumeBinaryIsString = assumeBinaryIsString, assumeInt96IsTimestamp = assumeInt96IsTimestamp, followParquetFormatSpec = followParquetFormatSpec) @@ -711,12 +711,12 @@ private[sql] object ParquetRelation extends Logging { * a [[StructType]] converted from the [[MessageType]] stored in this footer. 
*/ def readSchemaFromFooter( - footer: Footer, converter: CatalystSchemaConverter): StructType = { + footer: Footer, converter: ParquetSchemaConverter): StructType = { val fileMetaData = footer.getParquetMetadata.getFileMetaData fileMetaData .getKeyValueMetaData .toMap - .get(CatalystReadSupport.SPARK_METADATA_KEY) + .get(ParquetReadSupport.SPARK_METADATA_KEY) .flatMap(deserializeSchemaString) .getOrElse(converter.convert(fileMetaData.getSchema)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRowConverter.scala similarity index 93% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala rename to sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRowConverter.scala index f294e12a8f005..48ab2f3848675 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRowConverter.scala @@ -56,7 +56,7 @@ private[parquet] trait ParentContainerUpdater { private[parquet] object NoopUpdater extends ParentContainerUpdater /** - * A [[CatalystRowConverter]] is used to convert Parquet "structs" into Spark SQL [[InternalRow]]s. + * A [[ParquetRowConverter]] is used to convert Parquet records into Spark SQL [[InternalRow]]s. * Since any Parquet record is also a struct, this converter can also be used as root converter. * * When used as a root converter, [[NoopUpdater]] should be used since root converters don't have @@ -66,14 +66,14 @@ private[parquet] object NoopUpdater extends ParentContainerUpdater * @param catalystType Spark SQL schema that corresponds to the Parquet record type * @param updater An updater which propagates converted field values to the parent container */ -private[parquet] class CatalystRowConverter( +private[parquet] class ParquetRowConverter( parquetType: GroupType, catalystType: StructType, updater: ParentContainerUpdater) extends GroupConverter { /** - * Updater used together with field converters within a [[CatalystRowConverter]]. It propagates + * Updater used together with field converters within a [[ParquetRowConverter]]. It propagates * converted filed values to the `ordinal`-th cell in `currentRow`. */ private final class RowUpdater(row: MutableRow, ordinal: Int) extends ParentContainerUpdater { @@ -126,7 +126,7 @@ private[parquet] class CatalystRowConverter( catalystType match { case BooleanType | IntegerType | LongType | FloatType | DoubleType | BinaryType => - new CatalystPrimitiveConverter(updater) + new ParquetPrimitiveConverter(updater) case ByteType => new PrimitiveConverter { @@ -141,10 +141,10 @@ private[parquet] class CatalystRowConverter( } case t: DecimalType => - new CatalystDecimalConverter(t, updater) + new ParquetDecimalConverter(t, updater) case StringType => - new CatalystStringConverter(updater) + new ParquetStringConverter(updater) case TimestampType => // TODO Implements `TIMESTAMP_MICROS` once parquet-mr has that. 
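
Note: every nested converter above hands its decoded value to the enclosing container through a `ParentContainerUpdater`, which is what lets the same primitive converter fill either a row cell or an array slot. A minimal sketch of that pattern, assuming only parquet-mr's `PrimitiveConverter.addInt` callback and Catalyst's `MutableRow.setInt` (the class name below is hypothetical, not the actual Spark implementation):

    import org.apache.parquet.io.api.PrimitiveConverter
    import org.apache.spark.sql.catalyst.expressions.MutableRow

    // Forwards each decoded INT32 value into the `ordinal`-th cell of the row being materialized.
    class IntCellConverter(row: MutableRow, ordinal: Int) extends PrimitiveConverter {
      override def addInt(value: Int): Unit = row.setInt(ordinal, value)
    }
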
@@ -172,13 +172,13 @@ private[parquet] class CatalystRowConverter( } case t: ArrayType => - new CatalystArrayConverter(parquetType.asGroupType(), t, updater) + new ParquetArrayConverter(parquetType.asGroupType(), t, updater) case t: MapType => - new CatalystMapConverter(parquetType.asGroupType(), t, updater) + new ParquetMapConverter(parquetType.asGroupType(), t, updater) case t: StructType => - new CatalystRowConverter(parquetType.asGroupType(), t, new ParentContainerUpdater { + new ParquetRowConverter(parquetType.asGroupType(), t, new ParentContainerUpdater { override def set(value: Any): Unit = updater.set(value.asInstanceOf[InternalRow].copy()) }) @@ -186,7 +186,7 @@ private[parquet] class CatalystRowConverter( val catalystTypeForUDT = t.sqlType val nullable = parquetType.isRepetition(Repetition.OPTIONAL) val field = StructField("udt", catalystTypeForUDT, nullable) - val parquetTypeForUDT = new CatalystSchemaConverter().convertField(field) + val parquetTypeForUDT = new ParquetSchemaConverter().convertField(field) newConverter(parquetTypeForUDT, catalystTypeForUDT, updater) case _ => @@ -200,7 +200,7 @@ private[parquet] class CatalystRowConverter( * are handled by this converter. Parquet primitive types are only a subset of those of Spark * SQL. For example, BYTE, SHORT, and INT in Spark SQL are all covered by INT32 in Parquet. */ - private final class CatalystPrimitiveConverter(updater: ParentContainerUpdater) + private final class ParquetPrimitiveConverter(updater: ParentContainerUpdater) extends PrimitiveConverter { override def addBoolean(value: Boolean): Unit = updater.setBoolean(value) @@ -214,7 +214,7 @@ private[parquet] class CatalystRowConverter( /** * Parquet converter for strings. A dictionary is used to minimize string decoding cost. */ - private final class CatalystStringConverter(updater: ParentContainerUpdater) + private final class ParquetStringConverter(updater: ParentContainerUpdater) extends PrimitiveConverter { private var expandedDictionary: Array[UTF8String] = null @@ -239,7 +239,7 @@ private[parquet] class CatalystRowConverter( /** * Parquet converter for fixed-precision decimals. */ - private final class CatalystDecimalConverter( + private final class ParquetDecimalConverter( decimalType: DecimalType, updater: ParentContainerUpdater) extends PrimitiveConverter { @@ -278,7 +278,7 @@ private[parquet] class CatalystRowConverter( (unscaled << (64 - bits)) >> (64 - bits) } - if (precision <= CatalystSchemaConverter.MAX_PRECISION_FOR_INT64) { + if (precision <= ParquetSchemaConverter.MAX_PRECISION_FOR_INT64) { // Constructs a `Decimal` with an unscaled `Long` value if possible. 
val unscaled = bytesToUnscaledLong(bytes) Decimal(unscaled, precision, scale) @@ -308,7 +308,7 @@ private[parquet] class CatalystRowConverter( * * @see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists */ - private final class CatalystArrayConverter( + private final class ParquetArrayConverter( parquetSchema: GroupType, catalystSchema: ArrayType, updater: ParentContainerUpdater) @@ -385,7 +385,7 @@ private[parquet] class CatalystRowConverter( } /** Parquet converter for maps */ - private final class CatalystMapConverter( + private final class ParquetMapConverter( parquetType: GroupType, catalystType: MapType, updater: ParentContainerUpdater) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala similarity index 97% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala rename to sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala index acfa1b1ab6443..5c5c1f00f75b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala @@ -25,7 +25,7 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.parquet.schema.Type.Repetition._ import org.apache.parquet.schema._ -import org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, maxPrecisionForBytes, minBytesForPrecision} +import org.apache.spark.sql.parquet.ParquetSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, maxPrecisionForBytes, minBytesForPrecision} import org.apache.spark.sql.types._ import org.apache.spark.sql.{AnalysisException, SQLConf} @@ -54,7 +54,7 @@ import org.apache.spark.sql.{AnalysisException, SQLConf} * backwards-compatible with these settings. If this argument is set to `false`, we fallback * to old style non-standard behaviors. */ -private[parquet] class CatalystSchemaConverter( +private[parquet] class ParquetSchemaConverter( private val assumeBinaryIsString: Boolean, private val assumeInt96IsTimestamp: Boolean, private val followParquetFormatSpec: Boolean) { @@ -136,7 +136,7 @@ private[parquet] class CatalystSchemaConverter( val precision = field.getDecimalMetadata.getPrecision val scale = field.getDecimalMetadata.getScale - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( maxPrecision == -1 || 1 <= precision && precision <= maxPrecision, s"Invalid decimal precision: $typeName cannot store $precision digits (max $maxPrecision)") @@ -170,7 +170,7 @@ private[parquet] class CatalystSchemaConverter( } case INT96 => - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( assumeInt96IsTimestamp, "INT96 is not supported unless it's interpreted as timestamp. 
" + s"Please try to set ${SQLConf.PARQUET_INT96_AS_TIMESTAMP.key} to true.") @@ -212,11 +212,11 @@ private[parquet] class CatalystSchemaConverter( // // See: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists case LIST => - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( field.getFieldCount == 1, s"Invalid list type $field") val repeatedType = field.getType(0) - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( repeatedType.isRepetition(REPEATED), s"Invalid list type $field") if (isElementType(repeatedType, field.getName)) { @@ -232,17 +232,17 @@ private[parquet] class CatalystSchemaConverter( // See: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules-1 // scalastyle:on case MAP | MAP_KEY_VALUE => - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( field.getFieldCount == 1 && !field.getType(0).isPrimitive, s"Invalid map type: $field") val keyValueType = field.getType(0).asGroupType() - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( keyValueType.isRepetition(REPEATED) && keyValueType.getFieldCount == 2, s"Invalid map type: $field") val keyType = keyValueType.getType(0) - CatalystSchemaConverter.analysisRequire( + ParquetSchemaConverter.analysisRequire( keyType.isPrimitive, s"Map key type is expected to be a primitive type, but found: $keyType") @@ -325,7 +325,7 @@ private[parquet] class CatalystSchemaConverter( } private def convertField(field: StructField, repetition: Type.Repetition): Type = { - CatalystSchemaConverter.checkFieldName(field.name) + ParquetSchemaConverter.checkFieldName(field.name) field.dataType match { // =================== @@ -540,7 +540,7 @@ private[parquet] class CatalystSchemaConverter( } -private[parquet] object CatalystSchemaConverter { +private[parquet] object ParquetSchemaConverter { def checkFieldName(name: String): Unit = { // ,;{}()\n\t= and space are special characters in Parquet schema analysisRequire( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala similarity index 92% rename from sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala rename to sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala index 553d05e8c6b76..49b35998ca53e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala @@ -34,10 +34,10 @@ import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{SpecializedGetters, SpecificMutableRow} import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, minBytesForPrecision} +import org.apache.spark.sql.parquet.ParquetSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, minBytesForPrecision} import org.apache.spark.sql.types._ -private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { +private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer. 
// Here we are using `SpecializedGetters` rather than `InternalRow` so that we can directly access // data in `ArrayData` without the help of `SpecificMutableRow`. @@ -62,7 +62,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi private val decimalBuffer = new Array[Byte](minBytesForPrecision(DecimalType.MAX_PRECISION)) override def init(configuration: Configuration): WriteContext = { - val schemaString = configuration.get(CatalystWriteSupport.SPARK_ROW_SCHEMA) + val schemaString = configuration.get(ParquetWriteSupport.SPARK_ROW_SCHEMA) schema = StructType.fromString(schemaString) rootFieldWriters = schema.map(_.dataType).map(makeWriter) @@ -72,8 +72,8 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get) - val messageType = new CatalystSchemaConverter(configuration).convert(schema) - val metadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> schemaString).asJava + val messageType = new ParquetSchemaConverter(configuration).convert(schema) + val metadata = Map(ParquetReadSupport.SPARK_METADATA_KEY -> schemaString).asJava logDebug( s"""Initialized Parquet WriteSupport with Catalyst schema: @@ -261,13 +261,15 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val elementWriter = makeWriter(elementType) (row: SpecializedGetters, ordinal: Int) => { + val array = row.getArray(ordinal) consumeGroup { - val array = row.getArray(ordinal) + // Only creates the repeated field if the array is non-empty. if (array.numElements() > 0) { consumeField(repeatedGroupName, 0) { var i = 0 while (i < array.numElements()) { consumeGroup { + // Only creates the element field if the current array element is not null. if (!array.isNullAt(i)) { consumeField(elementFieldName, 0)(elementWriter.apply(array, i)) } @@ -285,8 +287,9 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val elementWriter = makeWriter(elementType) (row: SpecializedGetters, ordinal: Int) => { + val array = row.getArray(ordinal) consumeGroup { - val array = row.getArray(ordinal) + // Only creates the repeated field if the array is non-empty. if (array.numElements() > 0) { consumeField(repeatedFieldName, 0) { var i = 0 @@ -308,8 +311,9 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) (row: SpecializedGetters, ordinal: Int) => { + val map = row.getMap(ordinal) consumeGroup { - val map = row.getMap(ordinal) + // Only creates the repeated field if the map is non-empty. 
if (map.numElements() > 0) { consumeField(repeatedGroupName, 0) { var i = 0 @@ -317,6 +321,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi consumeGroup { mutableRow.update(0, map.keyArray().get(i, keyType)) consumeField("key", 0)(keyWriter.apply(mutableRow, 0)) + // Only creates the "value" field if the value if non-empty if (!map.valueArray().isNullAt(i)) { mutableRow.update(1, map.valueArray().get(i, valueType)) consumeField("value", 1)(valueWriter.apply(mutableRow, 1)) @@ -349,11 +354,11 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi } } -private[parquet] object CatalystWriteSupport { +private[parquet] object ParquetWriteSupport { val SPARK_ROW_SCHEMA: String = "org.apache.spark.sql.parquet.row.attributes" def setSchema(schema: StructType, configuration: Configuration): Unit = { - schema.map(_.name).foreach(CatalystSchemaConverter.checkFieldName) + schema.map(_.name).foreach(ParquetSchemaConverter.checkFieldName) configuration.set(SPARK_ROW_SCHEMA, schema.json) configuration.set( ParquetOutputFormat.WRITER_VERSION, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index 24ab233d22cb9..f340277a90873 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -282,7 +282,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest { assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))) assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE))) - val expectedSchema = new CatalystSchemaConverter(configuration).convert(schema) + val expectedSchema = new ParquetSchemaConverter(configuration).convert(schema) val actualSchema = readFooter(path, configuration).getFileMetaData.getSchema actualSchema.checkContains(expectedSchema) @@ -346,7 +346,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest { """.stripMargin) withTempPath { location => - val extraMetadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> sparkSchema.toString) + val extraMetadata = Map(ParquetReadSupport.SPARK_METADATA_KEY -> sparkSchema.toString) val fileMetadata = new FileMetaData(parquetSchema, extraMetadata, "Spark") val path = new Path(location.getCanonicalPath) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index ec6e37cd5ee85..2101fba4ff07d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -58,7 +58,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { int96AsTimestamp: Boolean = true, followParquetFormatSpec: Boolean = false, isThriftDerived: Boolean = false): Unit = { - val converter = new CatalystSchemaConverter( + val converter = new ParquetSchemaConverter( assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, followParquetFormatSpec = followParquetFormatSpec) @@ -83,7 +83,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { int96AsTimestamp: Boolean = true, followParquetFormatSpec: Boolean = false, isThriftDerived: Boolean = false): Unit = { - val converter = new CatalystSchemaConverter( + val converter = new ParquetSchemaConverter( 
assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, followParquetFormatSpec = followParquetFormatSpec) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala index f5dc9051a2cc2..58cf28794143a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -107,8 +107,8 @@ private[sql] trait ParquetTest extends SQLTestUtils { this: SparkFunSuite => } def writeMetadata(schema: StructType, path: Path, configuration: Configuration): Unit = { - val parquetSchema = new CatalystSchemaConverter(configuration).convert(schema) - val extraMetadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> schema.json).asJava + val parquetSchema = new ParquetSchemaConverter(configuration).convert(schema) + val extraMetadata = Map(ParquetReadSupport.SPARK_METADATA_KEY -> schema.json).asJava val createdBy = s"Apache Spark ${org.apache.spark.SPARK_VERSION}" val fileMetadata = new FileMetaData(parquetSchema, extraMetadata, createdBy) val parquetMetadata = new ParquetMetadata(fileMetadata, Seq.empty[BlockMetaData].asJava) From 0395e9505942647ce8e4e282223ae7fcdde872a3 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 2 Aug 2015 15:53:20 +0800 Subject: [PATCH 17/21] Renames root Parquet message name --- .../org/apache/spark/sql/parquet/ParquetReadSupport.scala | 6 ++++-- .../apache/spark/sql/parquet/ParquetSchemaConverter.scala | 7 ++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala index b67d15264c415..46418a4a8dcdb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetReadSupport.scala @@ -121,7 +121,9 @@ private[parquet] class ParquetReadSupport extends ReadSupport[InternalRow] with if (fileFieldNames.contains(field.name)) { // If the field exists in the target Parquet file, extracts the field type from the // full file schema and makes a single-field Parquet schema - new MessageType("root", fileSchema.getType(field.name)) + new MessageType( + ParquetSchemaConverter.SPARK_PARQUET_SCHEMA_NAME, + fileSchema.getType(field.name)) } else { // Otherwise, just resorts to `CatalystSchemaConverter` toParquet.convert(StructType(Array(field))) @@ -131,7 +133,7 @@ private[parquet] class ParquetReadSupport extends ReadSupport[InternalRow] with // columns. Note that it's possible that no columns are requested at all (e.g., count // some partition column of a partitioned Parquet table). That's why `fold` is used here // and always fallback to an empty Parquet schema. - .fold(new MessageType("root")) { + .fold(new MessageType(ParquetSchemaConverter.SPARK_PARQUET_SCHEMA_NAME)) { _ union _ } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala index 5c5c1f00f75b4..7495fce40e241 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala @@ -314,7 +314,10 @@ private[parquet] class ParquetSchemaConverter( * Converts a Spark SQL [[StructType]] to a Parquet [[MessageType]]. 
*/ def convert(catalystSchema: StructType): MessageType = { - Types.buildMessage().addFields(catalystSchema.map(convertField): _*).named("root") + Types + .buildMessage() + .addFields(catalystSchema.map(convertField): _*) + .named(ParquetSchemaConverter.SPARK_PARQUET_SCHEMA_NAME) } /** @@ -541,6 +544,8 @@ private[parquet] class ParquetSchemaConverter( private[parquet] object ParquetSchemaConverter { + val SPARK_PARQUET_SCHEMA_NAME = "spark_schema" + def checkFieldName(name: String): Unit = { // ,;{}()\n\t= and space are special characters in Parquet schema analysisRequire( From 23e523d26e91d58be297a898694e359b584b7c6f Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 2 Aug 2015 16:06:26 +0800 Subject: [PATCH 18/21] Makes implicit arguments in ParquetSchemaSuite explicit --- .../sql/parquet/ParquetSchemaSuite.scala | 257 +++++++++++++----- 1 file changed, 185 insertions(+), 72 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index 2101fba4ff07d..84c60f6a9a039 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -36,28 +36,25 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { protected def testSchemaInference[T <: Product: ClassTag: TypeTag]( testName: String, messageType: String, - binaryAsString: Boolean = true, - int96AsTimestamp: Boolean = true, - followParquetFormatSpec: Boolean = false, - isThriftDerived: Boolean = false): Unit = { + binaryAsString: Boolean, + int96AsTimestamp: Boolean, + followParquetFormatSpec: Boolean): Unit = { testSchema( testName, StructType.fromAttributes(ScalaReflection.attributesFor[T]), messageType, binaryAsString, int96AsTimestamp, - followParquetFormatSpec, - isThriftDerived) + followParquetFormatSpec) } protected def testParquetToCatalyst( testName: String, sqlSchema: StructType, parquetSchema: String, - binaryAsString: Boolean = true, - int96AsTimestamp: Boolean = true, - followParquetFormatSpec: Boolean = false, - isThriftDerived: Boolean = false): Unit = { + binaryAsString: Boolean, + int96AsTimestamp: Boolean, + followParquetFormatSpec: Boolean): Unit = { val converter = new ParquetSchemaConverter( assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, @@ -79,10 +76,9 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { testName: String, sqlSchema: StructType, parquetSchema: String, - binaryAsString: Boolean = true, - int96AsTimestamp: Boolean = true, - followParquetFormatSpec: Boolean = false, - isThriftDerived: Boolean = false): Unit = { + binaryAsString: Boolean, + int96AsTimestamp: Boolean, + followParquetFormatSpec: Boolean): Unit = { val converter = new ParquetSchemaConverter( assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, @@ -100,10 +96,9 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { testName: String, sqlSchema: StructType, parquetSchema: String, - binaryAsString: Boolean = true, - int96AsTimestamp: Boolean = true, - followParquetFormatSpec: Boolean = false, - isThriftDerived: Boolean = false): Unit = { + binaryAsString: Boolean, + int96AsTimestamp: Boolean, + followParquetFormatSpec: Boolean): Unit = { testCatalystToParquet( testName, @@ -111,8 +106,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema, 
binaryAsString, int96AsTimestamp, - followParquetFormatSpec, - isThriftDerived) + followParquetFormatSpec) testParquetToCatalyst( testName, @@ -120,8 +114,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema, binaryAsString, int96AsTimestamp, - followParquetFormatSpec, - isThriftDerived) + followParquetFormatSpec) } } @@ -138,7 +131,9 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | optional binary _6; |} """.stripMargin, - binaryAsString = false) + binaryAsString = false, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[(Byte, Short, Int, Long, java.sql.Date)]( "logical integral types", @@ -150,7 +145,10 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | required int64 _4 (INT_64); | optional int32 _5 (DATE); |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[String]]( "string", @@ -159,7 +157,9 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | optional binary _1 (UTF8); |} """.stripMargin, - binaryAsString = true) + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[String]]( "binary enum as string", @@ -167,7 +167,10 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional binary _1 (ENUM); |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[Seq[Int]]]( "non-nullable array - non-standard", @@ -177,7 +180,10 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | repeated int32 array; | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[Seq[Int]]]( "non-nullable array - standard", @@ -190,6 +196,8 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchemaInference[Tuple1[Seq[Integer]]]( @@ -202,7 +210,10 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[Seq[Integer]]]( "nullable array - standard", @@ -215,6 +226,8 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchemaInference[Tuple1[Map[Int, String]]]( @@ -229,6 +242,8 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchemaInference[Tuple1[Map[Int, String]]]( @@ -242,7 +257,10 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[Pair[Int, String]]]( "struct", @@ -254,6 +272,8 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchemaInference[Tuple1[Map[Int, (String, Seq[(Int, Double)])]]]( @@ -277,7 +297,10 @@ class ParquetSchemaInferenceSuite extends 
ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchemaInference[Tuple1[Map[Int, (String, Seq[(Int, Double)])]]]( "deeply nested type - standard", @@ -301,6 +324,8 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchemaInference[(Option[Int], Map[Int, Option[Double]])]( @@ -316,6 +341,8 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) // Parquet files generated by parquet-thrift are already handled by the schema converter, but @@ -325,26 +352,28 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { // as expected from attributes testSchemaInference[( Array[Byte], Array[Byte], Array[Byte], Seq[Int], Map[Array[Byte], Seq[Int]])]( - "thrift generated parquet schema", - """ - |message root { - | optional binary _1 (UTF8); - | optional binary _2 (UTF8); - | optional binary _3 (UTF8); - | optional group _4 (LIST) { - | repeated int32 _4_tuple; - | } - | optional group _5 (MAP) { - | repeated group map (MAP_KEY_VALUE) { - | required binary key (UTF8); - | optional group value (LIST) { - | repeated int32 value_tuple; - | } - | } - | } - |} - """.stripMargin, - isThriftDerived = true) + "thrift generated parquet schema", + """ + |message root { + | optional binary _1 (UTF8); + | optional binary _2 (UTF8); + | optional binary _3 (UTF8); + | optional group _4 (LIST) { + | repeated int32 _4_tuple; + | } + | optional group _5 (MAP) { + | repeated group map (MAP_KEY_VALUE) { + | required binary key (UTF8); + | optional group value (LIST) { + | repeated int32 value_tuple; + | } + | } + | } + |} + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) } } @@ -471,7 +500,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with nullable element type - 2", @@ -487,7 +519,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 1 - standard", @@ -500,7 +535,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 2", @@ -513,7 +551,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 3", @@ -524,7 +565,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | repeated int32 element; | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 4", @@ -545,7 +589,10 @@ class 
ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 5 - parquet-avro style", @@ -564,7 +611,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 6 - parquet-thrift style", @@ -583,7 +633,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) // ======================================================= // Tests for converting Catalyst ArrayType to Parquet LIST @@ -604,6 +657,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testCatalystToParquet( @@ -620,7 +675,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testCatalystToParquet( "Backwards-compatibility: LIST with non-nullable element type - 1 - standard", @@ -637,6 +695,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testCatalystToParquet( @@ -651,7 +711,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | repeated int32 array; | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) // ==================================================== // Tests for converting Parquet Map to Catalyst MapType @@ -672,7 +735,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: MAP with non-nullable value type - 2", @@ -689,7 +755,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: MAP with non-nullable value type - 3 - prior to 1.4.x", @@ -706,7 +775,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 1 - standard", @@ -723,7 +795,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 2", @@ -740,7 +815,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 3 - parquet-avro style", @@ -757,7 +835,10 
@@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) // ==================================================== // Tests for converting Catalyst MapType to Parquet Map @@ -779,6 +860,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testCatalystToParquet( @@ -796,7 +879,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testCatalystToParquet( "Backwards-compatibility: MAP with nullable value type - 1 - standard", @@ -814,6 +900,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testCatalystToParquet( @@ -831,7 +919,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | } | } |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) // ================================= // Tests for conversion for decimals @@ -844,6 +935,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional int32 f1 (DECIMAL(1, 0)); |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchema( @@ -853,6 +946,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional int32 f1 (DECIMAL(8, 3)); |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchema( @@ -862,6 +957,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional int32 f1 (DECIMAL(9, 3)); |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchema( @@ -871,6 +968,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional int64 f1 (DECIMAL(18, 3)); |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchema( @@ -880,6 +979,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { | optional fixed_len_byte_array(9) f1 (DECIMAL(19, 3)); |} """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, followParquetFormatSpec = true) testSchema( @@ -888,7 +989,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional fixed_len_byte_array(1) f1 (DECIMAL(1, 0)); |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchema( "DECIMAL(8, 3) - prior to 1.4.x", @@ -896,7 +1000,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional fixed_len_byte_array(4) f1 (DECIMAL(8, 3)); |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchema( "DECIMAL(9, 3) - prior to 1.4.x", @@ -904,7 +1011,10 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3)); |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) testSchema( "DECIMAL(18, 3) - prior to 1.4.x", @@ -912,5 +1022,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """message root { | optional fixed_len_byte_array(8) f1 
(DECIMAL(18, 3)); |} - """.stripMargin) + """.stripMargin, + binaryAsString = true, + int96AsTimestamp = true, + followParquetFormatSpec = false) } From 66419b7e019d94c41b6439287df0cf5a42cb4e15 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 2 Aug 2015 17:08:05 +0800 Subject: [PATCH 19/21] Renames followParquetFormatSpec to writeLegacyParquetFormat (its meaning is flipped) --- .../scala/org/apache/spark/sql/SQLConf.scala | 12 +- .../spark/sql/parquet/ParquetRelation.scala | 10 +- .../sql/parquet/ParquetSchemaConverter.scala | 66 +++---- .../sql/parquet/ParquetWriteSupport.scala | 185 ++++++++++++------ .../sql/parquet/ParquetSchemaSuite.scala | 110 +++++------ .../apache/spark/sql/hive/parquetSuites.scala | 2 +- 6 files changed, 220 insertions(+), 165 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index 6644e85d4a037..290ce472c6d96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -283,12 +283,12 @@ private[spark] object SQLConf { defaultValue = Some(true), doc = "Enables Parquet filter push-down optimization when set to true.") - val PARQUET_FOLLOW_PARQUET_FORMAT_SPEC = booleanConf( - key = "spark.sql.parquet.followParquetFormatSpec", + val PARQUET_WRITE_LEGACY_FORMAT = booleanConf( + key = "spark.sql.parquet.writeLegacyParquetFormat", defaultValue = Some(false), - doc = "Whether to follow Parquet's format specification when converting Parquet schema to " + - "Spark SQL schema and vice versa.", - isPublic = false) + doc = "When true, writes Parquet data in legacy format compatible with Spark 1.4.0 and prior " + + "versions, instead of the standard one defined in parquet-format spec.", + isPublic = true) val PARQUET_OUTPUT_COMMITTER_CLASS = stringConf( key = "spark.sql.parquet.output.committer.class", @@ -493,7 +493,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf { private[spark] def isParquetINT96AsTimestamp: Boolean = getConf(PARQUET_INT96_AS_TIMESTAMP) - private[spark] def followParquetFormatSpec: Boolean = getConf(PARQUET_FOLLOW_PARQUET_FORMAT_SPEC) + private[spark] def writeLegacyParquetFormat: Boolean = getConf(PARQUET_WRITE_LEGACY_FORMAT) private[spark] def inMemoryPartitionPruning: Boolean = getConf(IN_MEMORY_PARTITION_PRUNING) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala index f2e0f78da4433..af54851346992 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala @@ -233,8 +233,8 @@ private[sql] class ParquetRelation( // Sets flag for Parquet schema converter (converting Catalyst schema to Parquet schema) conf.set( - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, - sqlContext.conf.followParquetFormatSpec.toString) + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key, + sqlContext.conf.writeLegacyParquetFormat.toString) // Sets compression scheme conf.set( @@ -521,7 +521,7 @@ private[sql] object ParquetRelation extends Logging { val converter = new ParquetSchemaConverter( sqlContext.conf.isParquetBinaryAsString, sqlContext.conf.isParquetBinaryAsString, - sqlContext.conf.followParquetFormatSpec) + sqlContext.conf.writeLegacyParquetFormat) converter.convert(schema) } @@ -655,7 +655,7 @@ private[sql] object ParquetRelation extends Logging { filesToTouch: 
Seq[FileStatus], sqlContext: SQLContext): Option[StructType] = { val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp - val followParquetFormatSpec = sqlContext.conf.followParquetFormatSpec + val writeLegacyParquetFormat = sqlContext.conf.writeLegacyParquetFormat val serializedConf = new SerializableConfiguration(sqlContext.sparkContext.hadoopConfiguration) // HACK ALERT: @@ -695,7 +695,7 @@ private[sql] object ParquetRelation extends Logging { new ParquetSchemaConverter( assumeBinaryIsString = assumeBinaryIsString, assumeInt96IsTimestamp = assumeInt96IsTimestamp, - followParquetFormatSpec = followParquetFormatSpec) + writeLegacyParquetFormat = writeLegacyParquetFormat) footers.map { footer => ParquetRelation.readSchemaFromFooter(footer, converter) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala index 7495fce40e241..80656b04ed7ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala @@ -41,35 +41,33 @@ import org.apache.spark.sql.{AnalysisException, SQLConf} * @constructor * @param assumeBinaryIsString Whether unannotated BINARY fields should be assumed to be Spark SQL * [[StringType]] fields when converting Parquet a [[MessageType]] to Spark SQL - * [[StructType]]. + * [[StructType]]. This argument only affects Parquet read path. * @param assumeInt96IsTimestamp Whether unannotated INT96 fields should be assumed to be Spark SQL * [[TimestampType]] fields when converting Parquet a [[MessageType]] to Spark SQL * [[StructType]]. Note that Spark SQL [[TimestampType]] is similar to Hive timestamp, which * has optional nanosecond precision, but different from `TIME_MILLS` and `TIMESTAMP_MILLIS` - * described in Parquet format spec. - * @param followParquetFormatSpec Whether to generate standard DECIMAL, LIST, and MAP structure when - * converting Spark SQL [[StructType]] to Parquet [[MessageType]]. For Spark 1.4.x and - * prior versions, Spark SQL only supports decimals with a max precision of 18 digits, and - * uses non-standard LIST and MAP structure. Note that the current Parquet format spec is - * backwards-compatible with these settings. If this argument is set to `false`, we fallback - * to old style non-standard behaviors. + * described in Parquet format spec. This argument only affects Parquet read path. + * @param writeLegacyParquetFormat Whether to use legacy Parquet format compatible with Spark 1.4 + * and prior versions when converting a Catalyst [[StructType]] to a Parquet [[MessageType]]. + * When set to false, use standard format defined in parquet-format spec. This argument only + * affects Parquet write path. */ private[parquet] class ParquetSchemaConverter( private val assumeBinaryIsString: Boolean, private val assumeInt96IsTimestamp: Boolean, - private val followParquetFormatSpec: Boolean) { + private val writeLegacyParquetFormat: Boolean) { // Only used when constructing converter for converting Spark SQL schema to Parquet schema, in // which case `assumeInt96IsTimestamp` and `assumeBinaryIsString` are irrelevant. 
def this() = this( assumeBinaryIsString = SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get, assumeInt96IsTimestamp = SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get, - followParquetFormatSpec = SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get) + writeLegacyParquetFormat = SQLConf.PARQUET_WRITE_LEGACY_FORMAT.defaultValue.get) def this(conf: SQLConf) = this( assumeBinaryIsString = conf.isParquetBinaryAsString, assumeInt96IsTimestamp = conf.isParquetINT96AsTimestamp, - followParquetFormatSpec = conf.followParquetFormatSpec) + writeLegacyParquetFormat = conf.writeLegacyParquetFormat) def this(conf: Configuration) = this( assumeBinaryIsString = @@ -80,10 +78,10 @@ private[parquet] class ParquetSchemaConverter( conf.getBoolean( SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get), - followParquetFormatSpec = + writeLegacyParquetFormat = conf.getBoolean( - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get)) + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key, + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.defaultValue.get)) /** * Converts Parquet [[MessageType]] `parquetSchema` to a Spark SQL [[StructType]]. @@ -386,15 +384,15 @@ private[parquet] class ParquetSchemaConverter( case BinaryType => Types.primitive(BINARY, repetition).named(field.name) - // ===================================== - // Decimals (for Spark version <= 1.4.x) - // ===================================== + // ====================== + // Decimals (legacy mode) + // ====================== // Spark 1.4.x and prior versions only support decimals with a maximum precision of 18 and // always store decimals in fixed-length byte arrays. To keep compatibility with these older // versions, here we convert decimals with all precisions to `FIXED_LEN_BYTE_ARRAY` annotated // by `DECIMAL`. 
- case DecimalType.Fixed(precision, scale) if !followParquetFormatSpec => + case DecimalType.Fixed(precision, scale) if writeLegacyParquetFormat => Types .primitive(FIXED_LEN_BYTE_ARRAY, repetition) .as(DECIMAL) @@ -403,13 +401,13 @@ private[parquet] class ParquetSchemaConverter( .length(minBytesForPrecision(precision)) .named(field.name) - // ===================================== - // Decimals (follow Parquet format spec) - // ===================================== + // ======================== + // Decimals (standard mode) + // ======================== // Uses INT32 for 1 <= precision <= 9 case DecimalType.Fixed(precision, scale) - if precision <= MAX_PRECISION_FOR_INT32 && followParquetFormatSpec => + if precision <= MAX_PRECISION_FOR_INT32 && !writeLegacyParquetFormat => Types .primitive(INT32, repetition) .as(DECIMAL) @@ -419,7 +417,7 @@ private[parquet] class ParquetSchemaConverter( // Uses INT64 for 10 <= precision <= 18 case DecimalType.Fixed(precision, scale) - if precision <= MAX_PRECISION_FOR_INT64 && followParquetFormatSpec => + if precision <= MAX_PRECISION_FOR_INT64 && !writeLegacyParquetFormat => Types .primitive(INT64, repetition) .as(DECIMAL) @@ -428,7 +426,7 @@ private[parquet] class ParquetSchemaConverter( .named(field.name) // Uses FIXED_LEN_BYTE_ARRAY for all other precisions - case DecimalType.Fixed(precision, scale) if followParquetFormatSpec => + case DecimalType.Fixed(precision, scale) if !writeLegacyParquetFormat => Types .primitive(FIXED_LEN_BYTE_ARRAY, repetition) .as(DECIMAL) @@ -437,16 +435,16 @@ private[parquet] class ParquetSchemaConverter( .length(minBytesForPrecision(precision)) .named(field.name) - // =================================================== - // ArrayType and MapType (for Spark versions <= 1.4.x) - // =================================================== + // =================================== + // ArrayType and MapType (legacy mode) + // =================================== // Spark 1.4.x and prior versions convert ArrayType with nullable elements into a 3-level // LIST structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level anonymous // field name "array" is from parquet-avro. Note that this case is covered by the backwards- // compatibility rules implemented in `isElementType()`. - case ArrayType(elementType, nullable @ true) if !followParquetFormatSpec => + case ArrayType(elementType, nullable @ true) if writeLegacyParquetFormat => // group (LIST) { // optional group bag { // repeated array; @@ -464,7 +462,7 @@ private[parquet] class ParquetSchemaConverter( // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level // LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is // covered by the backwards-compatibility rules implemented in `isElementType()`. - case ArrayType(elementType, nullable @ false) if !followParquetFormatSpec => + case ArrayType(elementType, nullable @ false) if writeLegacyParquetFormat => // group (LIST) { // repeated array; // } @@ -476,7 +474,7 @@ private[parquet] class ParquetSchemaConverter( // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by // MAP_KEY_VALUE. This is covered by `convertGroupField(field: GroupType): DataType`. 
- case MapType(keyType, valueType, valueContainsNull) if !followParquetFormatSpec => + case MapType(keyType, valueType, valueContainsNull) if writeLegacyParquetFormat => // group (MAP) { // repeated group map (MAP_KEY_VALUE) { // required key; @@ -489,11 +487,11 @@ private[parquet] class ParquetSchemaConverter( convertField(StructField("key", keyType, nullable = false)), convertField(StructField("value", valueType, valueContainsNull))) - // ================================================== - // ArrayType and MapType (follow Parquet format spec) - // ================================================== + // ===================================== + // ArrayType and MapType (standard mode) + // ===================================== - case ArrayType(elementType, containsNull) if followParquetFormatSpec => + case ArrayType(elementType, containsNull) if !writeLegacyParquetFormat => // group (LIST) { // repeated group list { // element; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala index 49b35998ca53e..0e1c17a37e650 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala @@ -37,6 +37,17 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.parquet.ParquetSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, minBytesForPrecision} import org.apache.spark.sql.types._ +/** + * A Parquet [[WriteSupport]] implementation that writes Catalyst [[InternalRow]]s as Parquet + * messages. This class can write Parquet data in two modes: + * + * - Standard mode: Parquet data are written in standard format defined in parquet-format spec. + * - Legacy mode: Parquet data are written in legacy format compatible with Spark 1.4 and prior. + * + * This behavior can be controlled by SQL option `spark.sql.parquet.writeLegacyParquetFormat`. The + * value of the option is propagated to this class by the `init()` method and its Hadoop + * configuration argument. + */ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging { // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer. 
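// For example, a minimal standalone sketch of such a writer (the object and method names and
// the import path of SpecializedGetters are assumptions; addInteger/getInt are the calls used
// by the writers below):
object ValueWriterSketch {
  import org.apache.parquet.io.api.RecordConsumer
  import org.apache.spark.sql.catalyst.expressions.SpecializedGetters

  type ValueWriter = (SpecializedGetters, Int) => Unit

  // A writer for a non-null INT32 field: read the value at the given ordinal and forward it
  // to the Parquet record consumer.
  def intWriter(recordConsumer: RecordConsumer): ValueWriter =
    (row: SpecializedGetters, ordinal: Int) => recordConsumer.addInteger(row.getInt(ordinal))
}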
// Here we are using `SpecializedGetters` rather than `InternalRow` so that we can directly access @@ -52,8 +63,8 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit // The Parquet `RecordConsumer` to which all `InternalRow`s are written private var recordConsumer: RecordConsumer = _ - // Whether we should write standard Parquet data conforming to parquet-format spec or not - private var followParquetFormatSpec: Boolean = _ + // Whether to write data in legacy Parquet format compatible with Spark 1.4 and prior versions + private var writeLegacyParquetFormat: Boolean = _ // Reusable byte array used to write timestamps as Parquet INT96 values private val timestampBuffer = new Array[Byte](12) @@ -65,12 +76,10 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit val schemaString = configuration.get(ParquetWriteSupport.SPARK_ROW_SCHEMA) schema = StructType.fromString(schemaString) rootFieldWriters = schema.map(_.dataType).map(makeWriter) - - assert(configuration.get(SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key) != null) - followParquetFormatSpec = - configuration.getBoolean( - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key, - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get) + writeLegacyParquetFormat = { + assert(configuration.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key) != null) + configuration.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key).toBoolean + } val messageType = new ParquetSchemaConverter(configuration).convert(schema) val metadata = Map(ParquetReadSupport.SPARK_METADATA_KEY -> schemaString).asJava @@ -141,9 +150,15 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit recordConsumer.addBinary(Binary.fromByteArray(row.getUTF8String(ordinal).getBytes)) case TimestampType => - // TODO Writes `TimestampType` values as `TIMESTAMP_MICROS` once parquet-mr implements it (row: SpecializedGetters, ordinal: Int) => { - // Actually Spark SQL `TimestampType` only has microsecond precision. + // TODO Writes `TimestampType` values as `TIMESTAMP_MICROS` once parquet-mr implements it + // Currently we only support timestamps stored as INT96, which is compatible with Hive + // and Impala. However, INT96 is to be deprecated. We plan to support `TIMESTAMP_MICROS` + // defined in the parquet-format spec. But up until writing, the most recent parquet-mr + // version (1.8.1) hasn't implemented it yet. + + // NOTE: Starting from Spark 1.5, Spark SQL `TimestampType` only has microsecond + // precision. Nanosecond parts of timestamp values read from INT96 are simply stripped. 
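// Illustrative standalone sketch of the INT96 layout described above (the wrapper names are
// assumptions; `DateTimeUtils.toJulianDay`, the 12-byte buffer and the little-endian
// long-then-int layout come from the writer code that follows):
object Int96TimestampSketch {
  import java.nio.{ByteBuffer, ByteOrder}
  import org.apache.spark.sql.catalyst.util.DateTimeUtils

  def toInt96Bytes(microsSinceEpoch: Long): Array[Byte] = {
    val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(microsSinceEpoch)
    val bytes = new Array[Byte](12)
    ByteBuffer.wrap(bytes)
      .order(ByteOrder.LITTLE_ENDIAN)
      .putLong(timeOfDayNanos) // bytes 0-7: nanoseconds within the Julian day
      .putInt(julianDay)       // bytes 8-11: Julian day number
    bytes
  }
}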
val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(row.getLong(ordinal)) val buf = ByteBuffer.wrap(timestampBuffer) buf.order(ByteOrder.LITTLE_ENDIAN).putLong(timeOfDayNanos).putInt(julianDay) @@ -162,26 +177,14 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit (row: SpecializedGetters, ordinal: Int) => consumeGroup(writeFields(row.getStruct(ordinal, t.length), t, fieldWriters)) - case ArrayType(elementType, _) if followParquetFormatSpec => - makeThreeLevelArrayWriter(elementType, "list", "element") - - case ArrayType(elementType, true) if !followParquetFormatSpec => - makeThreeLevelArrayWriter(elementType, "bag", "array") - - case ArrayType(elementType, false) if !followParquetFormatSpec => - makeTwoLevelArrayWriter(elementType, "array") + case t: ArrayType => makeArrayWriter(t) - case t: MapType if followParquetFormatSpec => - makeMapWriter(t, "key_value") + case t: MapType => makeMapWriter(t) - case t: MapType if !followParquetFormatSpec => - makeMapWriter(t, "map") + case t: UserDefinedType[_] => makeWriter(t.sqlType) - case t: UserDefinedType[_] => - makeWriter(t.sqlType) - - case _ => - sys.error(s"Unsupported data type $dataType.") + // TODO Adds IntervalType support + case _ => sys.error(s"Unsupported data type $dataType.") } } @@ -239,15 +242,15 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit recordConsumer.addBinary(Binary.fromByteArray(fixedLengthBytes, 0, numBytes)) } - followParquetFormatSpec match { + writeLegacyParquetFormat match { // Standard mode, writes decimals with precision <= 9 as INT32 - case true if precision <= MAX_PRECISION_FOR_INT32 => int32Writer + case false if precision <= MAX_PRECISION_FOR_INT32 => int32Writer // Standard mode, writes decimals with precision <= 18 as INT64 - case true if precision <= MAX_PRECISION_FOR_INT64 => int64Writer + case false if precision <= MAX_PRECISION_FOR_INT64 => int64Writer // Legacy mode, writes decimals with precision <= 18 as FIXED_LEN_BYTE_ARRAY - case false if precision <= MAX_PRECISION_FOR_INT64 => binaryWriterUsingUnscaledLong + case true if precision <= MAX_PRECISION_FOR_INT64 => binaryWriterUsingUnscaledLong // All other cases: // - Standard mode, writes decimals with precision > 18 as FIXED_LEN_BYTE_ARRAY @@ -256,59 +259,113 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit } } - private def makeThreeLevelArrayWriter( - elementType: DataType, repeatedGroupName: String, elementFieldName: String): ValueWriter = { - val elementWriter = makeWriter(elementType) + def makeArrayWriter(arrayType: ArrayType): ValueWriter = { + val elementWriter = makeWriter(arrayType.elementType) - (row: SpecializedGetters, ordinal: Int) => { - val array = row.getArray(ordinal) - consumeGroup { - // Only creates the repeated field if the array is non-empty. - if (array.numElements() > 0) { - consumeField(repeatedGroupName, 0) { - var i = 0 - while (i < array.numElements()) { - consumeGroup { - // Only creates the element field if the current array element is not null. - if (!array.isNullAt(i)) { - consumeField(elementFieldName, 0)(elementWriter.apply(array, i)) + def threeLevelArrayWriter(repeatedGroupName: String, elementFieldName: String): ValueWriter = + (row: SpecializedGetters, ordinal: Int) => { + val array = row.getArray(ordinal) + consumeGroup { + // Only creates the repeated field if the array is non-empty. 
+ if (array.numElements() > 0) { + consumeField(repeatedGroupName, 0) { + var i = 0 + while (i < array.numElements()) { + consumeGroup { + // Only creates the element field if the current array element is not null. + if (!array.isNullAt(i)) { + consumeField(elementFieldName, 0)(elementWriter.apply(array, i)) + } } + i += 1 } - i += 1 } } } } - } - } - private def makeTwoLevelArrayWriter( - elementType: DataType, repeatedFieldName: String): ValueWriter = { - val elementWriter = makeWriter(elementType) - - (row: SpecializedGetters, ordinal: Int) => { - val array = row.getArray(ordinal) - consumeGroup { - // Only creates the repeated field if the array is non-empty. - if (array.numElements() > 0) { - consumeField(repeatedFieldName, 0) { - var i = 0 - while (i < array.numElements()) { - elementWriter.apply(array, i) - i += 1 + def twoLevelArrayWriter(repeatedFieldName: String): ValueWriter = + (row: SpecializedGetters, ordinal: Int) => { + val array = row.getArray(ordinal) + consumeGroup { + // Only creates the repeated field if the array is non-empty. + if (array.numElements() > 0) { + consumeField(repeatedFieldName, 0) { + var i = 0 + while (i < array.numElements()) { + elementWriter.apply(array, i) + i += 1 + } } } } } + + (writeLegacyParquetFormat, arrayType.containsNull) match { + case (false, _) => + // Standard mode: + // + // group (LIST) { + // repeated group list { + // ^~~~ repeatedGroupName + // element; + // ^~~~~~~ elementFieldName + // } + // } + threeLevelArrayWriter(repeatedGroupName = "list", elementFieldName = "element") + + case (true, true) => + // Legacy mode, with nullable elements: + // + // group (LIST) { + // optional group bag { + // ^~~ repeatedGroupName + // repeated array; + // ^~~~~ elementFieldName + // } + // } + threeLevelArrayWriter(repeatedGroupName = "bag", elementFieldName = "array") + + case (true, false) => + // Legacy mode, with non-nullable elements: + // + // group (LIST) { + // repeated array; + // ^~~~~ repeatedFieldName + // } + twoLevelArrayWriter(repeatedFieldName = "array") } } - private def makeMapWriter(mapType: MapType, repeatedGroupName: String): ValueWriter = { + private def makeMapWriter(mapType: MapType): ValueWriter = { val keyType = mapType.keyType val valueType = mapType.valueType val keyWriter = makeWriter(keyType) val valueWriter = makeWriter(valueType) val mutableRow = new SpecificMutableRow(keyType :: valueType :: Nil) + val repeatedGroupName = if (writeLegacyParquetFormat) { + // Legacy mode: + // + // group (MAP) { + // repeated group map (MAP_KEY_VALUE) { + // ^~~ repeatedGroupName + // required key; + // value; + // } + // } + "map" + } else { + // Standard mode: + // + // group (MAP) { + // repeated group key_value { + // ^~~~~~~~~ repeatedGroupName + // required key; + // value; + // } + // } + "key_value" + } (row: SpecializedGetters, ordinal: Int) => { val map = row.getMap(ordinal) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index 84c60f6a9a039..230983b243771 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -38,14 +38,14 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { messageType: String, binaryAsString: Boolean, int96AsTimestamp: Boolean, - followParquetFormatSpec: Boolean): Unit = { + writeLegacyParquetFormat: Boolean): Unit = { testSchema( 
testName, StructType.fromAttributes(ScalaReflection.attributesFor[T]), messageType, binaryAsString, int96AsTimestamp, - followParquetFormatSpec) + writeLegacyParquetFormat) } protected def testParquetToCatalyst( @@ -54,11 +54,11 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema: String, binaryAsString: Boolean, int96AsTimestamp: Boolean, - followParquetFormatSpec: Boolean): Unit = { + writeLegacyParquetFormat: Boolean): Unit = { val converter = new ParquetSchemaConverter( assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, - followParquetFormatSpec = followParquetFormatSpec) + writeLegacyParquetFormat = writeLegacyParquetFormat) test(s"sql <= parquet: $testName") { val actual = converter.convert(MessageTypeParser.parseMessageType(parquetSchema)) @@ -78,11 +78,11 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema: String, binaryAsString: Boolean, int96AsTimestamp: Boolean, - followParquetFormatSpec: Boolean): Unit = { + writeLegacyParquetFormat: Boolean): Unit = { val converter = new ParquetSchemaConverter( assumeBinaryIsString = binaryAsString, assumeInt96IsTimestamp = int96AsTimestamp, - followParquetFormatSpec = followParquetFormatSpec) + writeLegacyParquetFormat = writeLegacyParquetFormat) test(s"sql => parquet: $testName") { val actual = converter.convert(sqlSchema) @@ -98,7 +98,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema: String, binaryAsString: Boolean, int96AsTimestamp: Boolean, - followParquetFormatSpec: Boolean): Unit = { + writeLegacyParquetFormat: Boolean): Unit = { testCatalystToParquet( testName, @@ -106,7 +106,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema, binaryAsString, int96AsTimestamp, - followParquetFormatSpec) + writeLegacyParquetFormat) testParquetToCatalyst( testName, @@ -114,7 +114,7 @@ abstract class ParquetSchemaTest extends SparkFunSuite with ParquetTest { parquetSchema, binaryAsString, int96AsTimestamp, - followParquetFormatSpec) + writeLegacyParquetFormat) } } @@ -133,7 +133,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = false, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[(Byte, Short, Int, Long, java.sql.Date)]( "logical integral types", @@ -148,7 +148,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[String]]( "string", @@ -159,7 +159,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[String]]( "binary enum as string", @@ -170,7 +170,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[Seq[Int]]]( "non-nullable array - non-standard", @@ -183,7 +183,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[Seq[Int]]]( "non-nullable array - 
standard", @@ -198,7 +198,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchemaInference[Tuple1[Seq[Integer]]]( "nullable array - non-standard", @@ -213,7 +213,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[Seq[Integer]]]( "nullable array - standard", @@ -228,7 +228,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchemaInference[Tuple1[Map[Int, String]]]( "map - standard", @@ -244,7 +244,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchemaInference[Tuple1[Map[Int, String]]]( "map - non-standard", @@ -260,7 +260,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[Pair[Int, String]]]( "struct", @@ -274,7 +274,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchemaInference[Tuple1[Map[Int, (String, Seq[(Int, Double)])]]]( "deeply nested type - non-standard", @@ -300,7 +300,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchemaInference[Tuple1[Map[Int, (String, Seq[(Int, Double)])]]]( "deeply nested type - standard", @@ -326,7 +326,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchemaInference[(Option[Int], Map[Int, Option[Double]])]( "optional types", @@ -343,7 +343,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) // Parquet files generated by parquet-thrift are already handled by the schema converter, but // let's leave this test here until both read path and write path are all updated. 
@@ -373,7 +373,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) } } @@ -503,7 +503,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with nullable element type - 2", @@ -522,7 +522,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 1 - standard", @@ -538,7 +538,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 2", @@ -554,7 +554,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 3", @@ -568,7 +568,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 4", @@ -592,7 +592,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 5 - parquet-avro style", @@ -614,7 +614,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: LIST with non-nullable element type - 6 - parquet-thrift style", @@ -636,7 +636,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) // ======================================================= // Tests for converting Catalyst ArrayType to Parquet LIST @@ -659,7 +659,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testCatalystToParquet( "Backwards-compatibility: LIST with nullable element type - 2 - prior to 1.4.x", @@ -678,7 +678,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testCatalystToParquet( "Backwards-compatibility: LIST with non-nullable element type - 1 - standard", @@ -697,7 +697,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testCatalystToParquet( 
"Backwards-compatibility: LIST with non-nullable element type - 2 - prior to 1.4.x", @@ -714,7 +714,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) // ==================================================== // Tests for converting Parquet Map to Catalyst MapType @@ -738,7 +738,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: MAP with non-nullable value type - 2", @@ -758,7 +758,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: MAP with non-nullable value type - 3 - prior to 1.4.x", @@ -778,7 +778,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 1 - standard", @@ -798,7 +798,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 2", @@ -818,7 +818,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testParquetToCatalyst( "Backwards-compatibility: MAP with nullable value type - 3 - parquet-avro style", @@ -838,7 +838,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) // ==================================================== // Tests for converting Catalyst MapType to Parquet Map @@ -862,7 +862,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testCatalystToParquet( "Backwards-compatibility: MAP with non-nullable value type - 2 - prior to 1.4.x", @@ -882,7 +882,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testCatalystToParquet( "Backwards-compatibility: MAP with nullable value type - 1 - standard", @@ -902,7 +902,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testCatalystToParquet( "Backwards-compatibility: MAP with nullable value type - 3 - prior to 1.4.x", @@ -922,7 +922,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) // ================================= // Tests for conversion for decimals @@ -937,7 +937,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString 
= true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchema( "DECIMAL(8, 3) - standard", @@ -948,7 +948,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchema( "DECIMAL(9, 3) - standard", @@ -959,7 +959,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchema( "DECIMAL(18, 3) - standard", @@ -970,7 +970,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchema( "DECIMAL(19, 3) - standard", @@ -981,7 +981,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = true) + writeLegacyParquetFormat = false) testSchema( "DECIMAL(1, 0) - prior to 1.4.x", @@ -992,7 +992,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchema( "DECIMAL(8, 3) - prior to 1.4.x", @@ -1003,7 +1003,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchema( "DECIMAL(9, 3) - prior to 1.4.x", @@ -1014,7 +1014,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) testSchema( "DECIMAL(18, 3) - prior to 1.4.x", @@ -1025,5 +1025,5 @@ class ParquetSchemaSuite extends ParquetSchemaTest { """.stripMargin, binaryAsString = true, int96AsTimestamp = true, - followParquetFormatSpec = false) + writeLegacyParquetFormat = true) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala index f56fb96c52d37..0cb9f048c9fcd 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala @@ -608,7 +608,7 @@ class ParquetSourceSuite extends ParquetPartitioningTest { val conf = Seq( HiveContext.CONVERT_METASTORE_PARQUET.key -> "false", SQLConf.PARQUET_BINARY_AS_STRING.key -> "true", - SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key -> "true") + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "false") withSQLConf(conf: _*) { sql( From 6bda94ba32ebb61d089c5392f1afdbf5613b8549 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 2 Aug 2015 17:38:19 +0800 Subject: [PATCH 20/21] Renames analysisRequire to checkConversionRequirement --- .../sql/parquet/ParquetSchemaConverter.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala index 80656b04ed7ba..c5bab35aa421f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala @@ -134,7 +134,7 @@ 
private[parquet] class ParquetSchemaConverter( val precision = field.getDecimalMetadata.getPrecision val scale = field.getDecimalMetadata.getScale - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( maxPrecision == -1 || 1 <= precision && precision <= maxPrecision, s"Invalid decimal precision: $typeName cannot store $precision digits (max $maxPrecision)") @@ -168,7 +168,7 @@ private[parquet] class ParquetSchemaConverter( } case INT96 => - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( assumeInt96IsTimestamp, "INT96 is not supported unless it's interpreted as timestamp. " + s"Please try to set ${SQLConf.PARQUET_INT96_AS_TIMESTAMP.key} to true.") @@ -210,11 +210,11 @@ private[parquet] class ParquetSchemaConverter( // // See: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists case LIST => - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( field.getFieldCount == 1, s"Invalid list type $field") val repeatedType = field.getType(0) - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( repeatedType.isRepetition(REPEATED), s"Invalid list type $field") if (isElementType(repeatedType, field.getName)) { @@ -230,17 +230,17 @@ private[parquet] class ParquetSchemaConverter( // See: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules-1 // scalastyle:on case MAP | MAP_KEY_VALUE => - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( field.getFieldCount == 1 && !field.getType(0).isPrimitive, s"Invalid map type: $field") val keyValueType = field.getType(0).asGroupType() - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( keyValueType.isRepetition(REPEATED) && keyValueType.getFieldCount == 2, s"Invalid map type: $field") val keyType = keyValueType.getType(0) - ParquetSchemaConverter.analysisRequire( + ParquetSchemaConverter.checkConversionRequirement( keyType.isPrimitive, s"Map key type is expected to be a primitive type, but found: $keyType") @@ -546,7 +546,7 @@ private[parquet] object ParquetSchemaConverter { def checkFieldName(name: String): Unit = { // ,;{}()\n\t= and space are special characters in Parquet schema - analysisRequire( + checkConversionRequirement( !name.matches(".*[ ,;{}()\n\t=].*"), s"""Attribute name "$name" contains invalid character(s) among " ,;{}()\\n\\t=". |Please use alias to rename it. 
@@ -558,7 +558,7 @@ private[parquet] object ParquetSchemaConverter { schema } - def analysisRequire(f: => Boolean, message: String): Unit = { + def checkConversionRequirement(f: => Boolean, message: String): Unit = { if (!f) { throw new AnalysisException(message) } From 679888afa7f4128fb9bbbb021bd5ec06467f88ee Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 3 Aug 2015 18:58:41 +0800 Subject: [PATCH 21/21] Simplifies ParquetSchemaConverter and updates outdated comments --- .../sql/parquet/ParquetSchemaConverter.scala | 264 +++++++----------- .../sql/parquet/ParquetWriteSupport.scala | 10 +- 2 files changed, 109 insertions(+), 165 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala index c5bab35aa421f..26ca6b6cc5946 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetSchemaConverter.scala @@ -329,10 +329,6 @@ private[parquet] class ParquetSchemaConverter( ParquetSchemaConverter.checkFieldName(field.name) field.dataType match { - // =================== - // Simple atomic types - // =================== - case BooleanType => Types.primitive(BOOLEAN, repetition).named(field.name) @@ -363,173 +359,123 @@ private[parquet] class ParquetSchemaConverter( // NOTE: Spark SQL TimestampType is NOT a well defined type in Parquet format spec. // // As stated in PARQUET-323, Parquet `INT96` was originally introduced to represent nanosecond - // timestamp in Impala for some historical reasons, it's not recommended to be used for any - // other types and will probably be deprecated in future Parquet format spec. That's the - // reason why Parquet format spec only defines `TIMESTAMP_MILLIS` and `TIMESTAMP_MICROS` which - // are both logical types annotating `INT64`. + // timestamp in Impala for some historical reasons. It's not recommended to be used for any + // other types and will probably be deprecated in some future version of parquet-format spec. + // That's the reason why parquet-format spec only defines `TIMESTAMP_MILLIS` and + // `TIMESTAMP_MICROS` which are both logical types annotating `INT64`. // // Originally, Spark SQL uses the same nanosecond timestamp type as Impala and Hive. Starting - // from Spark 1.5.0, we resort to a timestamp type with 100 ns precision so that we can store - // a timestamp into a `Long`. This design decision is subject to change though, for example, - // we may resort to microsecond precision in the future. + // from Spark 1.5.0, we resort to microsecond timestamp type. // - // For Parquet, we plan to write all `TimestampType` value as `TIMESTAMP_MICROS`, but it's - // currently not implemented yet because parquet-mr 1.7.0 (the version we're currently using) - // hasn't implemented `TIMESTAMP_MICROS` yet. + // We plan to write all `TimestampType` values as `TIMESTAMP_MICROS`, but up to writing, the + // most recent version of parquet-mr (1.8.1) hasn't implemented `TIMESTAMP_MICROS` yet. // - // TODO Implements `TIMESTAMP_MICROS` once parquet-mr has that. + // TODO Converts to `TIMESTAMP_MICROS` once parquet-mr implements that. 
case TimestampType => Types.primitive(INT96, repetition).named(field.name) case BinaryType => Types.primitive(BINARY, repetition).named(field.name) - // ====================== - // Decimals (legacy mode) - // ====================== - - // Spark 1.4.x and prior versions only support decimals with a maximum precision of 18 and - // always store decimals in fixed-length byte arrays. To keep compatibility with these older - // versions, here we convert decimals with all precisions to `FIXED_LEN_BYTE_ARRAY` annotated - // by `DECIMAL`. - case DecimalType.Fixed(precision, scale) if writeLegacyParquetFormat => - Types - .primitive(FIXED_LEN_BYTE_ARRAY, repetition) - .as(DECIMAL) - .precision(precision) - .scale(scale) - .length(minBytesForPrecision(precision)) - .named(field.name) - - // ======================== - // Decimals (standard mode) - // ======================== - - // Uses INT32 for 1 <= precision <= 9 - case DecimalType.Fixed(precision, scale) - if precision <= MAX_PRECISION_FOR_INT32 && !writeLegacyParquetFormat => - Types - .primitive(INT32, repetition) - .as(DECIMAL) - .precision(precision) - .scale(scale) - .named(field.name) - - // Uses INT64 for 10 <= precision <= 18 - case DecimalType.Fixed(precision, scale) - if precision <= MAX_PRECISION_FOR_INT64 && !writeLegacyParquetFormat => - Types - .primitive(INT64, repetition) - .as(DECIMAL) - .precision(precision) - .scale(scale) - .named(field.name) - - // Uses FIXED_LEN_BYTE_ARRAY for all other precisions - case DecimalType.Fixed(precision, scale) if !writeLegacyParquetFormat => - Types - .primitive(FIXED_LEN_BYTE_ARRAY, repetition) - .as(DECIMAL) - .precision(precision) - .scale(scale) - .length(minBytesForPrecision(precision)) - .named(field.name) - - // =================================== - // ArrayType and MapType (legacy mode) - // =================================== - - // Spark 1.4.x and prior versions convert ArrayType with nullable elements into a 3-level - // LIST structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro - // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level anonymous - // field name "array" is from parquet-avro. Note that this case is covered by the backwards- - // compatibility rules implemented in `isElementType()`. - case ArrayType(elementType, nullable @ true) if writeLegacyParquetFormat => - // group (LIST) { - // optional group bag { - // repeated array; - // } - // } - ConversionPatterns.listType( - repetition, - field.name, - Types - .buildGroup(REPEATED) - // "array" is the name chosen by Spark SQL 1.4.0 and prior versions - .addField(convertField(StructField("array", elementType, nullable))) - .named("bag")) - - // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level - // LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is - // covered by the backwards-compatibility rules implemented in `isElementType()`. - case ArrayType(elementType, nullable @ false) if writeLegacyParquetFormat => - // group (LIST) { - // repeated array; - // } - ConversionPatterns.listType( - repetition, - field.name, - // "array" is the name chosen by parquet-avro (1.7.0 and prior version) - convertField(StructField("array", elementType, nullable), REPEATED)) - - // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by - // MAP_KEY_VALUE. This is covered by `convertGroupField(field: GroupType): DataType`. 
- case MapType(keyType, valueType, valueContainsNull) if writeLegacyParquetFormat => - // group (MAP) { - // repeated group map (MAP_KEY_VALUE) { - // required key; - // value; - // } - // } - ConversionPatterns.mapType( - repetition, - field.name, - convertField(StructField("key", keyType, nullable = false)), - convertField(StructField("value", valueType, valueContainsNull))) - - // ===================================== - // ArrayType and MapType (standard mode) - // ===================================== - - case ArrayType(elementType, containsNull) if !writeLegacyParquetFormat => - // group (LIST) { - // repeated group list { - // element; - // } - // } - Types - .buildGroup(repetition).as(LIST) - .addField( - Types.repeatedGroup() - .addField(convertField(StructField("element", elementType, containsNull))) - .named("list")) - .named(field.name) - - case MapType(keyType, valueType, valueContainsNull) => - // group (MAP) { - // repeated group key_value { - // required key; - // value; - // } - // } - Types - .buildGroup(repetition).as(MAP) - .addField( + case DecimalType.Fixed(precision, scale) => + val builder = writeLegacyParquetFormat match { + // Standard mode, 1 <= precision <= 9, converts to INT32 based DECIMAL + case false if precision <= MAX_PRECISION_FOR_INT32 => + Types.primitive(INT32, repetition) + + // Standard mode, 10 <= precision <= 18, converts to INT64 based DECIMAL + case false if precision <= MAX_PRECISION_FOR_INT64 => + Types.primitive(INT64, repetition) + + // All other cases: + // - Standard mode, 19 <= precision <= 38, converts to FIXED_LEN_BYTE_ARRAY based DECIMAL + // - Legacy mode, 1 <= precision <= 38, converts to FIXED_LEN_BYTE_ARRAY based DECIMAL + case _ => + val numBytes = minBytesForPrecision(precision) + Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(numBytes) + } + + builder.as(DECIMAL).precision(precision).scale(scale).named(field.name) + + case t: ArrayType => + val repeatedType = (writeLegacyParquetFormat, t.containsNull) match { + case (true, true) => + // Legacy mode: Spark 1.4.x and prior versions convert `ArrayType` with nullable + // elements into a 3-level `LIST` structure. This behavior is somewhat a hybrid of + // parquet-hive and parquet-avro (1.6.0rc3): the 3-level structure is similar to + // parquet-hive while the 3rd level anonymous field name "array" is from parquet-avro. + // + // group (LIST) { + // repeated group bag { | + // optional array; |- repeatedType + // } | + // } Types .repeatedGroup() - .addField(convertField(StructField("key", keyType, nullable = false))) - .addField(convertField(StructField("value", valueType, valueContainsNull))) - .named("key_value")) - .named(field.name) - - // =========== - // Other types - // =========== - - case StructType(fields) => - fields.foldLeft(Types.buildGroup(repetition)) { (builder, field) => - builder.addField(convertField(field)) - }.named(field.name) + .addField(convertField(StructField("array", t.elementType, t.containsNull))) + .named("bag") + + case (true, false) => + // Legacy mode: Spark 1.4.x and prior versions convert `ArrayType` with non-nullable + // elements into a 2-level `LIST` structure. This behavior mimics parquet-avro + // (1.6.0rc3). 
+ // + // group (LIST) { + // repeated array; <- repeatedType + // } + convertField(StructField("array", t.elementType, t.containsNull), REPEATED) + + case (false, _) => + // Standard mode: + // + // group (LIST) { + // repeated group list { | + // element; |- repeatedType + // } | + // } + Types + .repeatedGroup() + .addField(convertField(StructField("element", t.elementType, t.containsNull))) + .named("list") + } + + Types.buildGroup(repetition).as(LIST).addField(repeatedType).named(field.name) + + case t: MapType => + val repeatedGroupBuilder = + Types + .repeatedGroup() + .addField(convertField(StructField("key", t.keyType, nullable = false))) + .addField(convertField(StructField("value", t.valueType, t.valueContainsNull))) + + val repeatedGroup = if (writeLegacyParquetFormat) { + // Legacy mode: Spark 1.4.x and prior versions convert MapType into a 3-level group + // annotated by MAP_KEY_VALUE. + // + // group (MAP) { + // repeated group map (MAP_KEY_VALUE) { | + // required key; |- repeatedGroup + // value; | + // } | + // } + repeatedGroupBuilder.as(MAP_KEY_VALUE).named("map") + } else { + // Standard mode: + // + // group (MAP) { + // repeated group key_value { | + // required key; |- repeatedGroup + // value; | + // } | + // } + repeatedGroupBuilder.named("key_value") + } + + Types.buildGroup(repetition).as(MAP).addField(repeatedGroup).named(field.name) + + case t: StructType => + val parquetFields = t.fields.map(convertField) + Types.buildGroup(repetition).addFields(parquetFields: _*).named(field.name) case udt: UserDefinedType[_] => convertField(field.copy(dataType = udt.sqlType)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala index 0e1c17a37e650..17e0aaa360fba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetWriteSupport.scala @@ -243,18 +243,16 @@ private[parquet] class ParquetWriteSupport extends WriteSupport[InternalRow] wit } writeLegacyParquetFormat match { - // Standard mode, writes decimals with precision <= 9 as INT32 + // Standard mode, 1 <= precision <= 9, writes as INT32 case false if precision <= MAX_PRECISION_FOR_INT32 => int32Writer - // Standard mode, writes decimals with precision <= 18 as INT64 + // Standard mode, 10 <= precision <= 18, writes as INT64 case false if precision <= MAX_PRECISION_FOR_INT64 => int64Writer - // Legacy mode, writes decimals with precision <= 18 as FIXED_LEN_BYTE_ARRAY + // Legacy mode, 1 <= precision <= 18, writes as FIXED_LEN_BYTE_ARRAY case true if precision <= MAX_PRECISION_FOR_INT64 => binaryWriterUsingUnscaledLong - // All other cases: - // - Standard mode, writes decimals with precision > 18 as FIXED_LEN_BYTE_ARRAY - // - Legacy mode, writes decimals with precision > 18 as FIXED_LEN_BYTE_ARRAY + // Either standard or legacy mode, 19 <= precision <= 38, writes as FIXED_LEN_BYTE_ARRAY case _ => binaryWriterUsingUnscaledBytes } }
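// Illustrative sketch (field names and types are assumptions): the two MAP layouts described
// in the `convertField` and `makeMapWriter` comments above, written out as Parquet schema
// strings and parsed back with MessageTypeParser.
object MapLayoutSketch {
  import org.apache.parquet.schema.MessageTypeParser

  // Standard mode: the repeated group is named "key_value" and carries no annotation.
  val standardMap = MessageTypeParser.parseMessageType(
    """message root {
      |  optional group f (MAP) {
      |    repeated group key_value {
      |      required int32 key;
      |      optional binary value (UTF8);
      |    }
      |  }
      |}
    """.stripMargin)

  // Legacy mode (Spark 1.4.x and prior): the repeated group is named "map" and is annotated
  // with MAP_KEY_VALUE.
  val legacyMap = MessageTypeParser.parseMessageType(
    """message root {
      |  optional group f (MAP) {
      |    repeated group map (MAP_KEY_VALUE) {
      |      required int32 key;
      |      optional binary value (UTF8);
      |    }
      |  }
      |}
    """.stripMargin)
}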