
Commit b9c1367

fjh100456 authored and gatorsmile committed
[SPARK-21786][SQL] The 'spark.sql.parquet.compression.codec' and 'spark.sql.orc.compression.codec' configurations don't take effect on Hive table writing
What changes were proposed in this pull request?

Pass the 'spark.sql.parquet.compression.codec' value to 'parquet.compression'.
Pass the 'spark.sql.orc.compression.codec' value to 'orc.compress'.

How was this patch tested?

Add test.

Note: This is the same issue mentioned in #19218. That branch was deleted mistakenly, so this is a new PR instead.

gatorsmile maropu dongjoon-hyun discipleforteen

Author: fjh100456 <[email protected]>
Author: Takeshi Yamamuro <[email protected]>
Author: Wenchen Fan <[email protected]>
Author: gatorsmile <[email protected]>
Author: Yinan Li <[email protected]>
Author: Marcelo Vanzin <[email protected]>
Author: Juliusz Sompolski <[email protected]>
Author: Felix Cheung <[email protected]>
Author: jerryshao <[email protected]>
Author: Li Jin <[email protected]>
Author: Gera Shegalov <[email protected]>
Author: chetkhatri <[email protected]>
Author: Joseph K. Bradley <[email protected]>
Author: Bago Amirbekian <[email protected]>
Author: Xianjin YE <[email protected]>
Author: Bruce Robbins <[email protected]>
Author: zuotingbing <[email protected]>
Author: Kent Yao <[email protected]>
Author: hyukjinkwon <[email protected]>
Author: Adrian Ionescu <[email protected]>

Closes #20087 from fjh100456/HiveTableWriting.

(cherry picked from commit 00d1691)
Signed-off-by: gatorsmile <[email protected]>
1 parent e11d5ea commit b9c1367
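
For context, a minimal sketch of the user-facing behavior this patch fixes, assuming a Hive-enabled SparkSession named `spark` (the table name is illustrative, not from the commit):

    // Before this patch, the session conf below was ignored when writing to a
    // Hive table; with it, the value is propagated to "parquet.compression"
    // (respectively "orc.compress") on the write job.
    spark.conf.set("spark.sql.parquet.compression.codec", "gzip")
    spark.sql("CREATE TABLE events (id INT) STORED AS PARQUET")
    spark.sql("INSERT INTO events VALUES (1), (2)")  // now written as gzip-compressed Parquet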

File tree

5 files changed: 397 additions & 6 deletions


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOptions.scala

Lines changed: 2 additions & 0 deletions
@@ -67,4 +67,6 @@ object OrcOptions {
     "snappy" -> "SNAPPY",
     "zlib" -> "ZLIB",
     "lzo" -> "LZO")
+
+  def getORCCompressionCodecName(name: String): String = shortOrcCompressionCodecNames(name)
 }
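
As a quick illustration (not part of the diff), the new helper simply resolves a short codec name against the existing map, so an unrecognized name throws a NoSuchElementException:

    import org.apache.spark.sql.execution.datasources.orc.OrcOptions

    // Maps the session-level short name to the value ORC expects in "orc.compress".
    OrcOptions.getORCCompressionCodecName("snappy")  // returns "SNAPPY"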

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala

Lines changed: 5 additions & 1 deletion
@@ -28,7 +28,7 @@ import org.apache.spark.sql.internal.SQLConf
 /**
  * Options for the Parquet data source.
  */
-private[parquet] class ParquetOptions(
+class ParquetOptions(
     @transient private val parameters: CaseInsensitiveMap[String],
     @transient private val sqlConf: SQLConf)
   extends Serializable {
@@ -82,4 +82,8 @@ object ParquetOptions {
     "snappy" -> CompressionCodecName.SNAPPY,
     "gzip" -> CompressionCodecName.GZIP,
     "lzo" -> CompressionCodecName.LZO)
+
+  def getParquetCompressionCodecName(name: String): String = {
+    shortParquetCompressionCodecNames(name).name()
+  }
 }
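
A similar illustration for the Parquet side; note that `ParquetOptions` is no longer `private[parquet]`, which is what lets `HiveOptions` (next file) instantiate it:

    import org.apache.spark.sql.execution.datasources.parquet.ParquetOptions

    // Resolves the short name to Parquet's CompressionCodecName constant,
    // i.e. the spelling expected by the "parquet.compression" property.
    ParquetOptions.getParquetCompressionCodecName("gzip")  // returns "GZIP"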

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveOptions.scala

Lines changed: 22 additions & 0 deletions
@@ -19,7 +19,16 @@ package org.apache.spark.sql.hive.execution
 
 import java.util.Locale
 
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.hive.ql.plan.TableDesc
+import org.apache.orc.OrcConf.COMPRESS
+import org.apache.parquet.hadoop.ParquetOutputFormat
+
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
+import org.apache.spark.sql.execution.datasources.orc.OrcOptions
+import org.apache.spark.sql.execution.datasources.parquet.ParquetOptions
+import org.apache.spark.sql.internal.SQLConf
 
 /**
  * Options for the Hive data source. Note that rule `DetermineHiveSerde` will extract Hive
@@ -102,4 +111,17 @@ object HiveOptions {
     "collectionDelim" -> "colelction.delim",
     "mapkeyDelim" -> "mapkey.delim",
     "lineDelim" -> "line.delim").map { case (k, v) => k.toLowerCase(Locale.ROOT) -> v }
+
+  def getHiveWriteCompression(tableInfo: TableDesc, sqlConf: SQLConf): Option[(String, String)] = {
+    val tableProps = tableInfo.getProperties.asScala.toMap
+    tableInfo.getOutputFileFormatClassName.toLowerCase(Locale.ROOT) match {
+      case formatName if formatName.endsWith("parquetoutputformat") =>
+        val compressionCodec = new ParquetOptions(tableProps, sqlConf).compressionCodecClassName
+        Option((ParquetOutputFormat.COMPRESSION, compressionCodec))
+      case formatName if formatName.endsWith("orcoutputformat") =>
+        val compressionCodec = new OrcOptions(tableProps, sqlConf).compressionCodec
+        Option((COMPRESS.getAttribute, compressionCodec))
+      case _ => None
+    }
+  }
 }
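
A hedged usage sketch of the new helper; `tableDesc` stands for a `TableDesc` of a Hive table stored as Parquet and is assumed to be in scope:

    import org.apache.spark.sql.internal.SQLConf

    val conf = new SQLConf()
    conf.setConfString("spark.sql.parquet.compression.codec", "gzip")

    // Parquet output formats yield Some(("parquet.compression", <codec>)),
    // ORC yields Some(("orc.compress", <codec>)), anything else yields None;
    // a codec set in the table properties is expected to win over the session conf.
    HiveOptions.getHiveWriteCompression(tableDesc, conf).foreach {
      case (key, codec) => println(s"$key -> $codec")
    }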

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala

Lines changed: 15 additions & 5 deletions
@@ -55,18 +55,28 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
       customPartitionLocations: Map[TablePartitionSpec, String] = Map.empty,
       partitionAttributes: Seq[Attribute] = Nil): Set[String] = {
 
-    val isCompressed = hadoopConf.get("hive.exec.compress.output", "false").toBoolean
+    val isCompressed =
+      fileSinkConf.getTableInfo.getOutputFileFormatClassName.toLowerCase(Locale.ROOT) match {
+        case formatName if formatName.endsWith("orcoutputformat") =>
+          // For ORC, "mapreduce.output.fileoutputformat.compress",
+          // "mapreduce.output.fileoutputformat.compress.codec", and
+          // "mapreduce.output.fileoutputformat.compress.type"
+          // have no impact because it uses table properties to store compression information.
+          false
+        case _ => hadoopConf.get("hive.exec.compress.output", "false").toBoolean
+      }
+
     if (isCompressed) {
-      // Please note that isCompressed, "mapreduce.output.fileoutputformat.compress",
-      // "mapreduce.output.fileoutputformat.compress.codec", and
-      // "mapreduce.output.fileoutputformat.compress.type"
-      // have no impact on ORC because it uses table properties to store compression information.
       hadoopConf.set("mapreduce.output.fileoutputformat.compress", "true")
       fileSinkConf.setCompressed(true)
       fileSinkConf.setCompressCodec(hadoopConf
         .get("mapreduce.output.fileoutputformat.compress.codec"))
       fileSinkConf.setCompressType(hadoopConf
         .get("mapreduce.output.fileoutputformat.compress.type"))
+    } else {
+      // Set compression by priority
+      HiveOptions.getHiveWriteCompression(fileSinkConf.getTableInfo, sparkSession.sessionState.conf)
+        .foreach { case (compression, codec) => hadoopConf.set(compression, codec) }
     }
 
     val committer = FileCommitProtocol.instantiate(
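
Putting it together, a sketch of the resulting precedence when writing, again assuming a Hive-enabled SparkSession `spark` (table name illustrative):

    // 1. ORC tables ignore hive.exec.compress.output entirely; the codec comes
    //    from the "orc.compress" table property, else spark.sql.orc.compression.codec.
    // 2. Other formats with hive.exec.compress.output=true keep the legacy
    //    mapreduce.output.fileoutputformat.compress.* route.
    spark.conf.set("spark.sql.orc.compression.codec", "zlib")
    spark.sql("CREATE TABLE logs (msg STRING) STORED AS ORC")
    spark.sql("INSERT INTO logs VALUES ('hello')")  // written with ZLIB compression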
