@@ -19,8 +19,7 @@ package org.apache.spark.examples.sql.hive
// $example on:spark_hive$
import java.io.File

-import org.apache.spark.sql.Row
-import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.{Row, SaveMode, SparkSession}
// $example off:spark_hive$

object SparkHiveExample {
@@ -104,6 +103,60 @@ object SparkHiveExample
// ...
// $example off:spark_hive$
Member:

Do you not want the code below to render in the docs as part of the example? Maybe not, just checking whether that's intentional.

Contributor Author:

@srowen Thank you for the valuable review feedback; I added it so it can help other developers.

Contributor Author:

@srowen Can you please review this? cc @holdenk @sameeragarwal

Contributor Author:

@srowen I have updated the DDL for storing data with partitioning in Hive.
cc @HyukjinKwon @mgaido91 @markgrover @markhamstra

Member:

Why do you turn the example listing off and then on again? Just remove those two lines.

Contributor Author:

@srowen I misunderstood your first comment. I have reverted as suggested; please check now.


// Save DataFrame to a Hive managed table in Parquet format

/*
Member:

Oh, just noticed this. You're using Javadoc-style comments here, but they won't have any effect.
Just use the // style of comments that you see above, for consistency.

Contributor:

+1

Contributor Author:

@srowen Done, changes addressed

* 1. Create a Hive database/schema with an explicit HDFS location if you want one; otherwise the default
*    warehouse location will be used to store the Hive table data.
*    Ex: CREATE DATABASE IF NOT EXISTS database_name LOCATION hdfs_path;
*    You don't have to give a location for each table explicitly; every table under the specified schema will
*    be located at the location given when the schema was created.
* 2. Create a Hive managed table stored as Parquet.
*    Ex: CREATE TABLE records(key int, value string) STORED AS PARQUET;
*/
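// A hedged sketch, not part of the original example: the DDL described above could be issued through the
// same `sql` helper used below; the database name and HDFS location are placeholder assumptions.
sql("CREATE DATABASE IF NOT EXISTS database_name LOCATION 'hdfs:///user/hive/warehouse/database_name.db'")
sql("CREATE TABLE IF NOT EXISTS database_name.records(key INT, value STRING) STORED AS PARQUET")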
val hiveTableDF = sql("SELECT * FROM records")
Contributor:

.toDF is not needed

Contributor:

actually, I think spark.table("records") is a better example.

Contributor Author:

@srowen Done, removed toDF(). cc @cloud-fan

hiveTableDF.write.mode(SaveMode.Overwrite).saveAsTable("database_name.records")

// Save DataFrame to a Hive external table in Spark-compatible Parquet format.
/*
* 1. Create a Hive external table stored as Parquet, with an explicit location.
*    Ex: CREATE EXTERNAL TABLE records(key int, value string) STORED AS PARQUET
*        LOCATION '/user/hive/warehouse/database_name.db/records';
Contributor:

It's weird to create an external table without a location. Users may be confused about the difference between a managed table and an external table.

Contributor Author:

@cloud-fan We'll move the descriptive comments into the documentation with user-friendly wording. I have also added a location.

* Since we are not explicitly providing a Hive database location, it automatically uses the default warehouse
* location set through 'spark.sql.warehouse.dir' when the SparkSession is created with enableHiveSupport().
* For example, if '/user/hive/warehouse/' is the Hive warehouse location, the schema directory will be created
* under it as '/user/hive/warehouse/database_name.db'.
*/
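// A hedged sketch, not part of the original example: the external-table DDL could likewise be run via `sql`;
// the table name `records_external` and the location below are hypothetical placeholders.
sql("CREATE EXTERNAL TABLE IF NOT EXISTS database_name.records_external(key INT, value STRING) " +
  "STORED AS PARQUET LOCATION '/user/hive/warehouse/database_name.db/records'")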

// Make the Hive Parquet format compatible with the Spark Parquet format
Member:

parquet -> Parquet

Member:

spark -> Spark

Contributor Author:

@HyukjinKwon Thanks for pointing this out; I have fixed it.

spark.sqlContext.setConf("spark.sql.parquet.writeLegacyFormat", "true")
// Multiple Parquet files could be created under the given directory, depending on the volume of data.
Member:

parquet -> Parquet.

Contributor Author:

@HyukjinKwon Thanks for pointing this out; I have fixed it.

val hiveExternalTableLocation = "/user/hive/warehouse/database_name.db/records"
hiveTableDF.write.mode(SaveMode.Overwrite).parquet(hiveExternalTableLocation)

// Turn on the flag for dynamic partitioning
Member:

turn -> Turn.

Contributor Author:

@HyukjinKwon Thanks for pointing this out; I have fixed it.

spark.sqlContext.setConf("hive.exec.dynamic.partition", "true")
spark.sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
// You can create partitions in the Hive table, so downstream queries run much faster.
hiveTableDF.write.mode(SaveMode.Overwrite).partitionBy("key")
.parquet(hiveExternalTableLocation)
/*
If the data volume is very large, each partition may end up with many small files, which can hurt
Member:

This is more stuff that should go in the docs, not in comments in an example. It somewhat duplicates existing documentation. Is this commentary really needed to illustrate usage of the API? That's the only goal here.

What are "small-small files"? You have some inconsistent capitalization; Parquet should be capitalized, but not file, bandwidth, etc.

Contributor Author:

@srowen I totally agree. I will rephrase this content for the docs; for now I have removed it from here. Please take a look.

downstream query performance due to file, network, and disk I/O overhead.
To improve performance, you can create a single Parquet file under each partition directory by calling
'repartition' on the partition key of the Hive table.
*/
hiveTableDF.repartition($"key").write.mode(SaveMode.Overwrite)
Contributor:

This is not standard usage; let's not put it in the example.

Contributor Author:

@cloud-fan I removed all the comments. As discussed with @srowen, it makes more sense to put this in the docs, with the inconsistencies removed.

.partitionBy("key").parquet(hiveExternalTableLocation)

/*
You can also use coalesce to control the number of files under each partition. repartition does a full shuffle
and distributes data evenly across all partitions, whereas coalesce can reduce the number of partitions to the
Member:

The sentences need some cleanup here. What do you mean by 'Int' argument? Maybe it's best to point people to the API docs rather than repeat them incompletely.

Contributor Author:

@srowen done.

requested number without a full data shuffle; see the Dataset API docs for details.
*/
// A coalesce of 10 could create up to 10 Parquet files under each partition,
// which is useful when the data is large and partitioning makes sense.
hiveTableDF.coalesce(10).write.mode(SaveMode.Overwrite)
Contributor:

ditto

.partitionBy("key").parquet(hiveExternalTableLocation)
spark.stop()
}
}