5 changes: 3 additions & 2 deletions python/pyspark/sql/functions.py
@@ -2075,9 +2075,10 @@ class PandasUDFType(object):
def udf(f=None, returnType=StringType()):
Member: Do we need to just add a `deterministic` parameter? Is adding it to the end OK for PySpark, without breaking existing apps? cc @ueshin

Contributor Author: I followed what was done for the Scala UDF, where this parameter is not added but there is a method to set it. If we add a parameter here, I'd suggest adding it to the Scala API as well.

Member: Scala and Python are different, because the Scala API also serves the Java API.

Member: @gatorsmile, however, wouldn't it be better to keep them consistent if possible?

Member: I am saying this because I have had a few discussions about this before, and I am pretty sure we usually keep them the same whenever possible.

Member: Using `asNondeterministic` is not straightforward for users. On the Scala side, we have no choice if we want to avoid breaking the API. Anyway, I am fine with keeping it as is for now.

"""Creates a user defined function (UDF).

.. note:: The user-defined functions must be deterministic. Due to optimization,
.. note:: The user-defined functions are considered deterministic. Due to optimization,
Member: `... are considered deterministic by default.`

duplicate invocations may be eliminated or the function may even be invoked more times than
it is present in the query.
it is present in the query. If your function is not deterministic, call
`asNondeterministic`.
Member (@HyukjinKwon, Dec 23, 2017): Let's say this more explicitly, like `... call asNondeterministic() on the user-defined function`. It's partly because I think `UserDefinedFunction` is not exposed in the PySpark API doc.

Member: Yeah, I think it is not clear here what to call `asNondeterministic` on. Maybe add a simple example.


.. note:: The user-defined functions do not support conditional expressions or short-circuiting
in boolean expressions and it ends up with being executed all internally. If the functions
10 changes: 10 additions & 0 deletions python/pyspark/sql/tests.py
@@ -434,6 +434,16 @@ def test_udf_with_array_type(self):
self.assertEqual(list(range(3)), l1)
self.assertEqual(1, l2)

def test_nondeterministic_udf(self):
from pyspark.sql.functions import udf
import random
udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic()
df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND'))
random.seed(1234)
udf_add_ten = udf(lambda rand: rand + 10, IntegerType())
[row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect()
self.assertEqual(row[0] + 10, row[1])
Member: Compare the values, since you already set the seed?

Contributor Author: I am not sure why, but setting the seed doesn't seem to take effect. I will remove setting the seed.
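A likely explanation, as an editorial aside: the UDF body is executed in Python worker processes on the executors, so seeding the `random` module in the driver never reaches those workers. A small illustration of the two scopes (hypothetical, not part of the diff):

import random

random.seed(1234)              # seeds the random module in the driver process only
local_value = random.random()  # deterministic here: the driver RNG was seeded

# A UDF body, by contrast, is pickled and shipped to separate Python
# worker processes on the executors; each worker has its own
# random-module state, which this driver-side seed never touches.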


def test_broadcast_in_udf(self):
bar = {"a": "aa", "b": "bb", "c": "abc"}
foo = self.sc.broadcast(bar)
11 changes: 10 additions & 1 deletion python/pyspark/sql/udf.py
@@ -88,6 +88,7 @@ def __init__(self, func,
func.__name__ if hasattr(func, '__name__')
else func.__class__.__name__)
self.evalType = evalType
self._deterministic = True

@property
def returnType(self):
@@ -125,7 +126,7 @@ def _create_judf(self):
wrapped_func = _wrap_function(sc, self.func, self.returnType)
jdt = spark._jsparkSession.parseDataType(self.returnType.json())
judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
self._name, wrapped_func, jdt, self.evalType)
self._name, wrapped_func, jdt, self.evalType, self._deterministic)
return judf

def __call__(self, *cols):
@@ -157,5 +158,13 @@ def wrapper(*args):
wrapper.func = self.func
wrapper.returnType = self.returnType
wrapper.evalType = self.evalType
wrapper.asNondeterministic = self.asNondeterministic

return wrapper

def asNondeterministic(self):
"""
Updates UserDefinedFunction to nondeterministic.
Member (suggested revision):
        """
        Updates UserDefinedFunction to nondeterministic.

        .. versionadded:: 2.3
        """

"""
self._deterministic = False
return self
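Taken together, a small sketch of how the flag flows through this file (editorial addition; per the diff, `udf(...)` returns a wrapped function that forwards `asNondeterministic` to the underlying `UserDefinedFunction`):

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
import random

wrapped = udf(lambda: random.randint(0, 100), IntegerType())
# asNondeterministic() flips _deterministic on the UserDefinedFunction
# and returns it; _create_judf later forwards that flag to the JVM-side
# UserDefinedPythonFunction constructor.
rand_udf = wrapped.asNondeterministic()
assert rand_udf._deterministic is False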
@@ -58,6 +58,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends
| pythonIncludes: ${udf.func.pythonIncludes}
| pythonExec: ${udf.func.pythonExec}
| dataType: ${udf.dataType}
Member: Could you also print out pythonEvalType?

| udfDeterministic: ${udf.udfDeterministic}
""".stripMargin)

functionRegistry.createOrReplaceTempFunction(name, udf.builder)
@@ -29,9 +29,12 @@ case class PythonUDF(
func: PythonFunction,
dataType: DataType,
children: Seq[Expression],
evalType: Int)
evalType: Int,
udfDeterministic: Boolean)
extends Expression with Unevaluable with NonSQLExpression with UserDefinedExpression {

override lazy val deterministic: Boolean = udfDeterministic && children.forall(_.deterministic)

override def toString: String = s"$name(${children.mkString(", ")})"

override def nullable: Boolean = true
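The `deterministic` override above encodes a simple rule: a PythonUDF expression is deterministic only if the UDF itself was not flagged nondeterministic and every child expression is deterministic. A purely illustrative Python rendering of that rule (names are hypothetical):

def python_udf_is_deterministic(udf_deterministic, children):
    # Mirrors the Scala override: deterministic only when the UDF was not
    # flagged nondeterministic AND all child expressions are deterministic.
    return udf_deterministic and all(child.deterministic for child in children)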
@@ -29,10 +29,11 @@ case class UserDefinedPythonFunction(
name: String,
func: PythonFunction,
dataType: DataType,
pythonEvalType: Int) {
pythonEvalType: Int,
udfDeterministic: Boolean = true) {
Member: Don't we always pass in this parameter? Remove the default value?


def builder(e: Seq[Expression]): PythonUDF = {
PythonUDF(name, func, dataType, e, pythonEvalType)
PythonUDF(name, func, dataType, e, pythonEvalType, udfDeterministic)
}

/** Returns a [[Column]] that will evaluate to calling this UDF with the given input. */