From bc87b372e2d359b6e2d102ab0f5eebfb65f54310 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Tue, 7 Apr 2020 13:01:25 +0900 Subject: [PATCH] Fix --- docs/sql-ref-functions-udf-hive.md | 117 ++++++++++++++++++++--------- 1 file changed, 80 insertions(+), 37 deletions(-) diff --git a/docs/sql-ref-functions-udf-hive.md b/docs/sql-ref-functions-udf-hive.md index 3beb13ba18e01..a3c21f78a6ce9 100644 --- a/docs/sql-ref-functions-udf-hive.md +++ b/docs/sql-ref-functions-udf-hive.md @@ -25,45 +25,88 @@ Spark SQL supports integration of Hive UDFs, UDAFs and UDTFs. Similar to Spark U ### Examples +Hive has two UDF interfaces: [UDF](https://github.com/apache/hive/blob/master/udf/src/java/org/apache/hadoop/hive/ql/exec/UDF.java) and [GenericUDF](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java). +An example below uses [GenericUDFAbs](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAbs.java) derived from `GenericUDF`. + +

+// Register `GenericUDFAbs` and use it in Spark SQL.
+// Note that, if you use your own programmed one, you need to add a JAR containig it into a classpath,
+// e.g., `spark.sql("ADD JAR yourHiveUDF.jar")`.
+spark.sql("CREATE TEMPORARY FUNCTION testUDF AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFAbs'")
+
+spark.sql("SELECT * FROM hiveUDFTestTable").show()
+// +-----+
+// |value|
+// +-----+
+// | -1.0|
+// |  2.0|
+// | -3.0|
+// +-----+
+
+spark.sql("SELECT testUDF(value) FROM t").show()
+// +--------------+
+// |testUDF(value)|
+// +--------------+
+// |           1.0|
+// |           2.0|
+// |           3.0|
+// +--------------+
+
+ +An example below uses [GenericUDTFExplode](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFExplode.java) derived from [GenericUDTF](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java). +

-// Register a Hive UDF and use it in Spark SQL
-// Scala
-// include the JAR file containing mytest.hiveUDF implementation
-spark.sql("ADD JAR myHiveUDF.jar")
-spark.sql("CREATE TEMPORARY FUNCTION testUDF AS 'mytest.hiveUDF'")
-spark.sql("SELECT testUDF(value) FROM hiveUDFTestTable")
-
-// Register a Hive UDAF and use it in Spark SQL
-// Scala
-// include the JAR file containing
-// org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax
-spark.sql("ADD JAR myHiveUDAF.jar")
+// Register `GenericUDTFExplode` and use it in Spark SQL
 spark.sql(
-          """
-            |CREATE TEMPORARY FUNCTION hive_max
-            |AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax'
-          """.stripMargin)
-spark.sql("SELECT key % 2, hive_max(key) FROM t GROUP BY key % 2")
-
-// Register a Hive UDTF and use it in Spark SQL
-// Scala
-// GenericUDTFCount2 outputs the number of rows seen, twice.
-// The function source code can be found at:
-// https://cwiki.apache.org/confluence/display/Hive/DeveloperGuide+UDTF
-// include the JAR file containing GenericUDTFCount2 implementation
-spark.sql("ADD JAR myHiveUDTF.jar")
+  """
+    |CREATE TEMPORARY FUNCTION hiveUDTF
+    |    AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDTFExplode'
+  """.stripMargin)
+
+spark.sql("SELECT * FROM t").show()
+// +------+
+// | value|
+// +------+
+// |[1, 2]|
+// |[3, 4]|
+// +------+
+
+spark.sql("SELECT hiveUDTF(value) FROM t").show()
+// +---+
+// |col|
+// +---+
+// |  1|
+// |  2|
+// |  3|
+// |  4|
+// +---+
+
+ +Hive has two UDAF interfaces: [UDAF](https://github.com/apache/hive/blob/master/udf/src/java/org/apache/hadoop/hive/ql/exec/UDAF.java) and [GenericUDAFResolver](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFResolver.java). +An example below uses [GenericUDAFSum](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java) derived from `GenericUDAFResolver`. + +

+// Register `GenericUDAFSum` and use it in Spark SQL
 spark.sql(
-          """
-            |CREATE TEMPORARY FUNCTION udtf_count2
-            |AS 'org.apache.spark.sql.hive.execution.GenericUDTFCount2'
-          """.stripMargin)
-spark.sql("SELECT udtf_count2(a) FROM (SELECT 1 AS a)").show
-
-+----+
-|col1|
-+----+
-|   1|
-|   1|
-+----+
+  """
+    |CREATE TEMPORARY FUNCTION hiveUDAF
+    |    AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum'
+  """.stripMargin)
+
+spark.sql("SELECT * FROM t").show()
+// +---+-----+
+// |key|value|
+// +---+-----+
+// |  a|    1|
+// |  a|    2|
+// |  b|    3|
+// +---+-----+
 
+spark.sql("SELECT key, hiveUDAF(value) FROM t GROUP BY key").show()
+// +---+---------------+
+// |key|hiveUDAF(value)|
+// +---+---------------+
+// |  b|              3|
+// |  a|              3|
+// +---+---------------+