@@ -19,8 +19,7 @@ package org.apache.spark.examples.sql.hive
// $example on:spark_hive$
import java.io.File

-import org.apache.spark.sql.Row
-import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.{Row, SaveMode, SparkSession}
// $example off:spark_hive$

object SparkHiveExample {
@@ -102,8 +101,41 @@ object SparkHiveExample {
// | 4| val_4| 4| val_4|
// | 5| val_5| 5| val_5|
// ...
// $example off:spark_hive$

// Create Hive managed table with parquet
[Review] Member: parquet -> Parquet
[Reply] Contributor Author: @HyukjinKwon Thanks for highlighting this; fixed.

sql("CREATE TABLE records(key int, value string) STORED AS PARQUET")
// Save DataFrame to Hive Managed table as Parquet format
[Review] Member: Managed -> managed
[Reply] Contributor Author: @HyukjinKwon Thanks for highlighting this; fixed.

val hiveTableDF = sql("SELECT * FROM records")
hiveTableDF.write.mode(SaveMode.Overwrite).saveAsTable("database_name.records")
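// Illustrative read-back (a sketch, not part of the PR diff): a managed table
// is resolved by name through the metastore.
spark.table("database_name.records").show()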
// Create External Hive table with parquet
sql("CREATE EXTERNAL TABLE records(key int, value string) " +
"STORED AS PARQUET LOCATION '/user/hive/warehouse/'")
// to make Hive parquet format compatible with spark parquet format
[Review] Member: parquet -> Parquet
[Review] Member: spark -> Spark
[Reply] Contributor Author: @HyukjinKwon Thanks for highlighting these; fixed.

spark.sqlContext.setConf("spark.sql.parquet.writeLegacyFormat", "true")

// Multiple parquet files could be created under the given directory, depending on the volume of data.
[Review] Member: parquet -> Parquet
[Reply] Contributor Author: @HyukjinKwon Thanks for highlighting this; fixed.

val hiveExternalTableLocation = "/user/hive/warehouse/database_name.db/records"

// Save DataFrame to Hive External table as compatible parquet format
[Review] Member: parquet -> Parquet
[Reply] Contributor Author: @HyukjinKwon Thanks for highlighting this; fixed.

hiveTableDF.write.mode(SaveMode.Overwrite).parquet(hiveExternalTableLocation)
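// Round-trip check (a sketch, not part of the PR diff): the files written
// above are plain Parquet and can be read back directly by path.
spark.read.parquet(hiveExternalTableLocation).show()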

// turn on flag for Dynamic Partitioning
[Review] Member: turn -> Turn
[Reply] Contributor Author: @HyukjinKwon Thanks for highlighting this; fixed.

spark.sqlContext.setConf("hive.exec.dynamic.partition", "true")
spark.sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")

// You can create partitions in the Hive table, so that downstream queries run much faster.
hiveTableDF.write.mode(SaveMode.Overwrite).partitionBy("key")
.parquet(hiveExternalTableLocation)
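// Assumed layout for illustration (not part of the PR diff): partitionBy("key")
// writes one subdirectory per distinct key value, e.g.
//   /user/hive/warehouse/database_name.db/records/key=1/part-...snappy.parquet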

// reduce number of files for each partition by repartition
[Review] Member: reduce -> Reduce
[Reply] Contributor Author: @HyukjinKwon Thanks for highlighting this; fixed.

hiveTableDF.repartition($"key").write.mode(SaveMode.Overwrite)
[Review] Contributor: This is not a standard usage; let's not put it in the example.
[Reply] Contributor Author: @cloud-fan Removed all the comments; as discussed with @srowen, it does make sense to have this in the docs with the inconsistency removed.

.partitionBy("key").parquet(hiveExternalTableLocation)

// Control number of files in each partition by coalesce
[Review] Member: Control number of files -> Control the number of files
[Reply] Contributor Author: @HyukjinKwon Thanks for highlighting this; fixed.

hiveTableDF.coalesce(10).write.mode(SaveMode.Overwrite)
[Review] Contributor: ditto (same concern as the repartition example above).

.partitionBy("key").parquet(hiveExternalTableLocation)
// $example off:spark_hive$

spark.stop()
}
}