apache · MaxGekk · Jul 21, 2018 · Jul 21, 2018 · Jul 22, 2018 · Jul 22, 2018
diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala
@@ -58,7 +58,7 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
       options: Map[String, String],
       files: Seq[FileStatus]): Option[StructType] = {
     val conf = spark.sparkContext.hadoopConfiguration
-    val parsedOptions = new AvroOptions(options, conf)
+    val parsedOptions = new AvroOptions(options, conf, spark.sessionState.conf)
 
     // Schema evolution is not supported yet. Here we only pick a single random sample file to
     // figure out the schema of the whole dataset.
@@ -113,16 +113,17 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
       job: Job,
       options: Map[String, String],
       dataSchema: StructType): OutputWriterFactory = {
-    val parsedOptions = new AvroOptions(options, spark.sessionState.newHadoopConf())
+    val parsedOptions = new AvroOptions(
+      options,
+      spark.sessionState.newHadoopConf(),
+      spark.sessionState.conf)
     val outputAvroSchema = SchemaConverters.toAvroType(
       dataSchema, nullable = false, parsedOptions.recordName, parsedOptions.recordNamespace)
 
     AvroJob.setOutputKeySchema(job, outputAvroSchema)
-    val AVRO_COMPRESSION_CODEC = "spark.sql.avro.compression.codec"
-    val AVRO_DEFLATE_LEVEL = "spark.sql.avro.deflate.level"
     val COMPRESS_KEY = "mapred.output.compress"
 
-    spark.conf.get(AVRO_COMPRESSION_CODEC, "snappy") match {
+    parsedOptions.compression match {
       case "uncompressed" =>
         log.info("writing uncompressed Avro records")
         job.getConfiguration.setBoolean(COMPRESS_KEY, false)
@@ -133,8 +134,7 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
         job.getConfiguration.set(AvroJob.CONF_OUTPUT_CODEC, DataFileConstants.SNAPPY_CODEC)
 
       case "deflate" =>
-        val deflateLevel = spark.conf.get(
-          AVRO_DEFLATE_LEVEL, Deflater.DEFAULT_COMPRESSION.toString).toInt
+        val deflateLevel = parsedOptions.compressionLevel
         log.info(s"compressing Avro output using deflate (level=$deflateLevel)")
         job.getConfiguration.setBoolean(COMPRESS_KEY, true)
         job.getConfiguration.set(AvroJob.CONF_OUTPUT_CODEC, DataFileConstants.DEFLATE_CODEC)
@@ -158,7 +158,7 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
 
     val broadcastedConf =
       spark.sparkContext.broadcast(new AvroFileFormat.SerializableConfiguration(hadoopConf))
-    val parsedOptions = new AvroOptions(options, hadoopConf)
+    val parsedOptions = new AvroOptions(options, hadoopConf, spark.sessionState.conf)
 
     (file: PartitionedFile) => {
       val log = LoggerFactory.getLogger(classOf[AvroFileFormat])

diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala
@@ -21,16 +21,18 @@ import org.apache.hadoop.conf.Configuration
 
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
+import org.apache.spark.sql.internal.SQLConf
 
 /**
  * Options for Avro Reader and Writer stored in case insensitive manner.
  */
 class AvroOptions(
     @transient val parameters: CaseInsensitiveMap[String],
-    @transient val conf: Configuration) extends Logging with Serializable {
+    @transient val conf: Configuration,
+    @transient val sqlConf: SQLConf) extends Logging with Serializable {
 
-  def this(parameters: Map[String, String], conf: Configuration) = {
-    this(CaseInsensitiveMap(parameters), conf)
+  def this(parameters: Map[String, String], conf: Configuration, sqlConf: SQLConf) = {
+    this(CaseInsensitiveMap(parameters), conf, sqlConf)
   }
 
   /**
@@ -68,4 +70,25 @@ class AvroOptions(
       .map(_.toBoolean)
       .getOrElse(!ignoreFilesWithoutExtension)
   }
+
+  /**
+   * The `compression` option allows to specify a compression codec used in write.
+   * Currently supported codecs are `uncompressed`, `snappy` and `deflate`.
+   * If the option is not set, the `snappy` compression is used by default.
+   */
+  val compression: String = parameters.get("compression").getOrElse(sqlConf.avroCompressionCodec)
+
+
+  /**
+   * Level of compression in the range of 1..9 inclusive. 1 - for fast, 9 - for best compression.
+   * If the compression level is not set for `deflate` compression, the current value of SQL
+   * config `spark.sql.avro.deflate.level` is used by default. For other compressions, the default
+   * value is `6`.
+   */
+  val compressionLevel: Int = {
+    val defaultLevel = 6
+    parameters.get("compressionLevel").map(_.toInt).getOrElse {
+      if (compression == "deflate") sqlConf.avroDeflateLevel else defaultLevel
+    }
+  }
 }
diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala
@@ -355,7 +355,7 @@ class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     }
   }
 
-  test("write with compression") {
+  test("write with compression - sql configs") {
     withTempPath { dir =>
       val AVRO_COMPRESSION_CODEC = "spark.sql.avro.compression.codec"
       val AVRO_DEFLATE_LEVEL = "spark.sql.avro.deflate.level"
@@ -889,4 +889,30 @@ class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       assert(count == 8)
     }
   }
+
+  test("SPARK-24881: write with compression - avro options") {
+    withTempPath { dir =>
+      val uncompressDir = s"$dir/uncompress"
+      val deflateDir = s"$dir/deflate"
+      val snappyDir = s"$dir/snappy"
+
+      val df = spark.read.avro(testAvro)
+      df.write
+        .option("compression", "uncompressed")
+        .avro(uncompressDir)
+      df.write
+        .options(Map("compression" -> "deflate", "compressionLevel" -> "9"))
+        .avro(deflateDir)
+      df.write
+        .option("compression", "snappy")
+        .avro(snappyDir)
+
+      val uncompressSize = FileUtils.sizeOfDirectory(new File(uncompressDir))
+      val deflateSize = FileUtils.sizeOfDirectory(new File(deflateDir))
+      val snappySize = FileUtils.sizeOfDirectory(new File(snappyDir))
+
+      assert(uncompressSize > deflateSize)
+      assert(snappySize > deflateSize)
+    }
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1422,6 +1422,17 @@ object SQLConf {
       "This only takes effect when spark.sql.repl.eagerEval.enabled is set to true.")
     .intConf
     .createWithDefault(20)
+
+  val AVRO_COMPRESSION_CODEC = buildConf("spark.sql.avro.compression.codec")
+    .doc("Compression codec used in writing of AVRO files.")
+    .stringConf
+    .createWithDefault("snappy")
+
+  val AVRO_DEFLATE_LEVEL = buildConf("spark.sql.avro.deflate.level")
+    .doc("Compression level for the deflate codec used in writing of AVRO files. " +
+      "Valid value must be in the range of from 1 to 9 inclusive. Default value is 6.")
+    .intConf
+    .createWithDefault(6)
 }
 
 /**
@@ -1806,6 +1817,10 @@ class SQLConf extends Serializable with Logging {
 
   def replEagerEvalTruncate: Int = getConf(SQLConf.REPL_EAGER_EVAL_TRUNCATE)
 
+  def avroCompressionCodec: String = getConf(SQLConf.AVRO_COMPRESSION_CODEC)
+
+  def avroDeflateLevel: Int = getConf(SQLConf.AVRO_DEFLATE_LEVEL)
+
   /** ********************** SQLConf functionality methods ************ */
 
   /** Set Spark SQL configuration properties. */