@@ -39,6 +39,13 @@ private[spark] object PythonEvalType {

  val SQL_PANDAS_SCALAR_UDF = 200
  val SQL_PANDAS_GROUP_MAP_UDF = 201
+
+  def toString(pythonEvalType: Int): String = pythonEvalType match {
+    case NON_UDF => "NON_UDF"
+    case SQL_BATCHED_UDF => "SQL_BATCHED_UDF"
+    case SQL_PANDAS_SCALAR_UDF => "SQL_PANDAS_SCALAR_UDF"
+    case SQL_PANDAS_GROUP_MAP_UDF => "SQL_PANDAS_GROUP_MAP_UDF"
+  }
}

/**
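
For illustration, the same mapping sketched in standalone Python. The helper name is hypothetical, and the NON_UDF and SQL_BATCHED_UDF values (0 and 100) are assumptions taken from the surrounding Spark source rather than visible in this hunk:

# Hypothetical Python mirror of the Scala PythonEvalType.toString helper.
# Assumed values: NON_UDF = 0, SQL_BATCHED_UDF = 100 (not shown in this hunk).
_EVAL_TYPE_NAMES = {
    0: "NON_UDF",
    100: "SQL_BATCHED_UDF",
    200: "SQL_PANDAS_SCALAR_UDF",
    201: "SQL_PANDAS_GROUP_MAP_UDF",
}

def eval_type_to_string(python_eval_type):
    """Map the wire-protocol eval-type int to a readable name for log output."""
    return _EVAL_TYPE_NAMES[python_eval_type]
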
11 changes: 8 additions & 3 deletions python/pyspark/sql/functions.py
@@ -2075,9 +2075,14 @@ class PandasUDFType(object):
def udf(f=None, returnType=StringType()):
Member:
Do we need to just add a parameter for deterministic? Is adding it to the end OK for PySpark, without breaking existing apps? cc @ueshin

Contributor Author:
I followed what was done for the Scala UDF, where this parameter is not added but there is a method to set it. If we add a parameter here, I'd suggest adding it to the Scala API as well.

Member:
Scala and Python are different, because the Scala API also serves the Java API.

Member:
@gatorsmile, however, wouldn't it be better to keep them consistent if possible?

Member:
I am saying this because I have had a few talks about this before, and I am pretty sure we usually keep them the same whenever possible.

Member:
Using asNondeterministic is not straightforward for users. On the Scala side, we have no choice if we want to avoid breaking the API. Anyway, I am fine with keeping it as is.

"""Creates a user defined function (UDF).

-    .. note:: The user-defined functions must be deterministic. Due to optimization,
-        duplicate invocations may be eliminated or the function may even be invoked more times than
-        it is present in the query.
+    .. note:: The user-defined functions are considered deterministic by default. Due to
+        optimization, duplicate invocations may be eliminated or the function may even be invoked
+        more times than it is present in the query. If your function is not deterministic, call
+        `asNondeterministic` on the user defined function. E.g.:
+
+    >>> from pyspark.sql.types import IntegerType
+    >>> import random
+    >>> random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic()

    .. note:: The user-defined functions do not support conditional expressions or short circuiting
        in boolean expressions and it ends up with being executed all internally. If the functions
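
To make the new docstring note concrete, a minimal usage sketch, assuming a running SparkSession named `spark`; the column name and threshold are illustrative:

import random

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic()

# Marking the UDF nondeterministic tells the optimizer it cannot assume repeated
# calls return the same value, so rewrites that would collapse or duplicate the
# invocation (e.g. when both a projection and a filter reference column `r`)
# are disabled.
df = spark.range(10).withColumn('r', random_udf())
df.where(df.r > 50).show()
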
9 changes: 9 additions & 0 deletions python/pyspark/sql/tests.py
@@ -434,6 +434,15 @@ def test_udf_with_array_type(self):
        self.assertEqual(list(range(3)), l1)
        self.assertEqual(1, l2)

+    def test_nondeterministic_udf(self):
+        from pyspark.sql.functions import udf
+        import random
+        udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic()
+        df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND'))
+        udf_add_ten = udf(lambda rand: rand + 10, IntegerType())
+        [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect()
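+        # RAND_PLUS_TEN must be derived from the same per-row RAND value;
+        # the assertion would fail if the random column were re-evaluated.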
+        self.assertEqual(row[0] + 10, row[1])

    def test_broadcast_in_udf(self):
        bar = {"a": "aa", "b": "bb", "c": "abc"}
        foo = self.sc.broadcast(bar)
13 changes: 12 additions & 1 deletion python/pyspark/sql/udf.py
@@ -88,6 +88,7 @@ def __init__(self, func,
            func.__name__ if hasattr(func, '__name__')
            else func.__class__.__name__)
        self.evalType = evalType
+        self._deterministic = True

    @property
    def returnType(self):
@@ -125,7 +126,7 @@ def _create_judf(self):
        wrapped_func = _wrap_function(sc, self.func, self.returnType)
        jdt = spark._jsparkSession.parseDataType(self.returnType.json())
        judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(
-            self._name, wrapped_func, jdt, self.evalType)
+            self._name, wrapped_func, jdt, self.evalType, self._deterministic)
        return judf

    def __call__(self, *cols):
@@ -157,5 +158,15 @@ def wrapper(*args):
        wrapper.func = self.func
        wrapper.returnType = self.returnType
        wrapper.evalType = self.evalType
+        wrapper.asNondeterministic = self.asNondeterministic

        return wrapper

+    def asNondeterministic(self):
+        """
+        Updates UserDefinedFunction to nondeterministic.
+
+        .. versionadded:: 2.3
+        """
+        self._deterministic = False
+        return self

Member suggested this docstring format, which the final revision adopts:

        """
        Updates UserDefinedFunction to nondeterministic.

        .. versionadded:: 2.3
        """
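
A note on the `wrapper.asNondeterministic = self.asNondeterministic` forwarding added above: `udf(...)` hands back the plain wrapper function rather than the UserDefinedFunction object, so the attribute must be forwarded for the chained call to resolve. A small sketch of the resulting behavior (illustrative variable names):

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

f = udf(lambda: 'x', StringType())  # f is the wrapper function, not the UDF object
g = f.asNondeterministic()          # resolves through the forwarded attribute
# Since asNondeterministic() returns self, g is the underlying
# UserDefinedFunction, which is itself callable in select()/withColumn().
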
@@ -23,6 +23,7 @@ import scala.reflect.runtime.universe.TypeTag
import scala.util.Try

import org.apache.spark.annotation.InterfaceStability
+import org.apache.spark.api.python.PythonEvalType
import org.apache.spark.internal.Logging
import org.apache.spark.sql.api.java._
import org.apache.spark.sql.catalyst.{JavaTypeInference, ScalaReflection}
Member:
UDFRegistration's doc:

 * @note The user-defined functions must be deterministic.

Looks obsolete.
@@ -41,8 +42,6 @@ import org.apache.spark.util.Utils
 * spark.udf
 * }}}
 *
- * @note The user-defined functions must be deterministic.
- *
 * @since 1.3.0
 */
@InterfaceStability.Stable
@@ -58,6 +57,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends
         | pythonIncludes: ${udf.func.pythonIncludes}
         | pythonExec: ${udf.func.pythonExec}
         | dataType: ${udf.dataType}

Member:
Could you also print out pythonEvalType?

+         | pythonEvalType: ${PythonEvalType.toString(udf.pythonEvalType)}
+         | udfDeterministic: ${udf.udfDeterministic}
       """.stripMargin)

    functionRegistry.createOrReplaceTempFunction(name, udf.builder)
@@ -29,9 +29,12 @@ case class PythonUDF(
    func: PythonFunction,
    dataType: DataType,
    children: Seq[Expression],
-    evalType: Int)
+    evalType: Int,
+    udfDeterministic: Boolean)
  extends Expression with Unevaluable with NonSQLExpression with UserDefinedExpression {

+  override lazy val deterministic: Boolean = udfDeterministic && children.forall(_.deterministic)
+
  override def toString: String = s"$name(${children.mkString(", ")})"

  override def nullable: Boolean = true
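
The `deterministic` override encodes the usual expression-tree rule: a PythonUDF is deterministic only if the UDF itself is and every child expression is. A toy model of that conjunction (plain Python for illustration, not Spark API):

class Expr(object):
    def __init__(self, udf_deterministic=True, children=()):
        self.udf_deterministic = udf_deterministic
        self.children = children

    @property
    def deterministic(self):
        # Mirrors: udfDeterministic && children.forall(_.deterministic)
        return self.udf_deterministic and all(c.deterministic for c in self.children)

# A deterministic UDF over a nondeterministic input is nondeterministic overall.
assert not Expr(True, children=(Expr(False),)).deterministic
assert Expr(True, children=(Expr(True),)).deterministic
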
@@ -29,10 +29,11 @@ case class UserDefinedPythonFunction(
    name: String,
    func: PythonFunction,
    dataType: DataType,
-    pythonEvalType: Int) {
+    pythonEvalType: Int,
+    udfDeterministic: Boolean) {

  def builder(e: Seq[Expression]): PythonUDF = {
-    PythonUDF(name, func, dataType, e, pythonEvalType)
+    PythonUDF(name, func, dataType, e, pythonEvalType, udfDeterministic)
  }

  /** Returns a [[Column]] that will evaluate to calling this UDF with the given input. */
@@ -109,4 +109,5 @@ class MyDummyPythonUDF extends UserDefinedPythonFunction(
  name = "dummyUDF",
  func = new DummyUDF,
  dataType = BooleanType,
-  pythonEvalType = PythonEvalType.SQL_BATCHED_UDF)
+  pythonEvalType = PythonEvalType.SQL_BATCHED_UDF,
+  udfDeterministic = true)