fix.

gatorsmile · gatorsmile · commit 09a1b89cd443 · 2018-01-04T11:41:58.000+08:00
diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py
@@ -227,15 +227,15 @@ def dropGlobalTempView(self, viewName):
     @ignore_unicode_prefix
     @since(2.0)
     def registerFunction(self, name, f, returnType=StringType()):
-        """Registers a python function (including lambda function) as a UDF
+        """Registers a Python function (including lambda function) or a wrapped/native UDF
         so it can be used in SQL statements.
 
         In addition to a name and the function itself, the return type can be optionally specified.
         When the return type is not given it default to a string and conversion will automatically
         be done.  For any other return type, the produced object must match the specified type.
 
         :param name: name of the UDF
-        :param f: python function
+        :param f: a Python function, or a wrapped/native UserDefinedFunction
         :param returnType: a :class:`pyspark.sql.types.DataType` object
         :return: a wrapped :class:`UserDefinedFunction`
 
@@ -260,14 +260,14 @@ def registerFunction(self, name, f, returnType=StringType()):
         >>> from pyspark.sql.functions import udf
         >>> from pyspark.sql.types import IntegerType, StringType
         >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic()
-        >>> newRandom_udf = spark.catalog.registerFunction(
-        ...     "random_udf", random_udf, StringType())  # doctest: +SKIP
+        >>> newRandom_udf = spark.catalog.registerFunction("random_udf", random_udf, StringType())
         >>> spark.sql("SELECT random_udf()").collect()  # doctest: +SKIP
         [Row(random_udf()=u'82')]
         >>> spark.range(1).select(newRandom_udf()).collect()  # doctest: +SKIP
         [Row(random_udf()=u'62')]
         """
 
+        # This is to check whether the input function is a wrapped/native UserDefinedFunction
         if hasattr(f, 'asNondeterministic'):
             udf = UserDefinedFunction(f.func, returnType=returnType, name=name,
                                       evalType=PythonEvalType.SQL_BATCHED_UDF,
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
@@ -175,15 +175,15 @@ def range(self, start, end=None, step=1, numPartitions=None):
     @ignore_unicode_prefix
     @since(1.2)
     def registerFunction(self, name, f, returnType=StringType()):
-        """Registers a python function (including lambda function) as a UDF
+        """Registers a Python function (including lambda function) or a wrapped/native UDF
         so it can be used in SQL statements.
 
         In addition to a name and the function itself, the return type can be optionally specified.
         When the return type is not given it default to a string and conversion will automatically
         be done.  For any other return type, the produced object must match the specified type.
 
         :param name: name of the UDF
-        :param f: python function
+        :param f: a Python function, or a wrapped/native UserDefinedFunction
         :param returnType: a :class:`pyspark.sql.types.DataType` object
         :return: a wrapped :class:`UserDefinedFunction`
 
@@ -208,8 +208,7 @@ def registerFunction(self, name, f, returnType=StringType()):
         >>> from pyspark.sql.functions import udf
         >>> from pyspark.sql.types import IntegerType, StringType
         >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic()
-        >>> newRandom_udf = sqlContext.registerFunction(
-        ...     "random_udf", random_udf, StringType())  # doctest: +SKIP
+        >>> newRandom_udf = sqlContext.registerFunction("random_udf", random_udf, StringType())
         >>> sqlContext.sql("SELECT random_udf()").collect()  # doctest: +SKIP
         [Row(random_udf()=u'82')]
         >>> sqlContext.range(1).select(newRandom_udf()).collect()  # doctest: +SKIP
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
@@ -378,7 +378,24 @@ def test_udf2(self):
         [res] = self.spark.sql("SELECT strlen(a) FROM test WHERE strlen(a) > 1").collect()
         self.assertEqual(4, res[0])
 
-    def test_non_deterministic_udf(self):
+    def test_udf3(self):
+        twoargs = self.spark.catalog.registerFunction(
+            "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y), IntegerType())
+        self.assertEqual(twoargs.deterministic, True)
+        [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect()
+        self.assertEqual(row[0], 5)
+
+    def test_nondeterministic_udf(self):
+        from pyspark.sql.functions import udf
+        import random
+        udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic()
+        self.assertEqual(udf_random_col.deterministic, False)
+        df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND'))
+        udf_add_ten = udf(lambda rand: rand + 10, IntegerType())
+        [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect()
+        self.assertEqual(row[0] + 10, row[1])
+
+    def test_nondeterministic_udf2(self):
         import random
         from pyspark.sql.functions import udf
         random_udf = udf(lambda: random.randint(6, 6), IntegerType()).asNondeterministic()
@@ -391,6 +408,7 @@ def test_non_deterministic_udf(self):
         self.assertEqual(row[0], "6")
         [row] = self.spark.range(1).select(random_udf()).collect()
         self.assertEqual(row[0], 6)
+        # render_doc() reproduces the help() exception without printing output
         pydoc.render_doc(udf(lambda: random.randint(6, 6), IntegerType()))
         pydoc.render_doc(random_udf)
         pydoc.render_doc(random_udf1)
@@ -452,15 +470,6 @@ def test_udf_with_array_type(self):
         self.assertEqual(list(range(3)), l1)
         self.assertEqual(1, l2)
 
-    def test_nondeterministic_udf(self):
-        from pyspark.sql.functions import udf
-        import random
-        udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic()
-        df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND'))
-        udf_add_ten = udf(lambda rand: rand + 10, IntegerType())
-        [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect()
-        self.assertEqual(row[0] + 10, row[1])
-
     def test_broadcast_in_udf(self):
         bar = {"a": "aa", "b": "bb", "c": "abc"}
         foo = self.sc.broadcast(bar)