Skip to content

Commit d9a4439

Browse files
author
Olivier Girardot
committed
SPARK-7118 Add the coalesce Spark SQL function available in PySpark
No changes to the scala/java part, only changes in Python.
1 parent 4f87e95 commit d9a4439

File tree

1 file changed

+37
-0
lines changed

python/pyspark/sql/functions.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
'rand',
3838
'randn',
3939
'sparkPartitionId',
40+
'coalesce',
4041
'udf']
4142

4243

@@ -167,6 +168,42 @@ def approxCountDistinct(col, rsd=None):
167168
return Column(jc)
168169

169170

171+
def coalesce(*cols):
    """Returns the first column that is not null.

    >>> cDf = sqlContext.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
    >>> cDf.show()
    +----+----+
    |   a|   b|
    +----+----+
    |null|null|
    |   1|null|
    |null|   2|
    +----+----+

    >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()
    +-------------+
    |Coalesce(a,b)|
    +-------------+
    |         null|
    |            1|
    |            2|
    +-------------+

    >>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()
    +----+----+---------------+
    |   a|   b|Coalesce(a,0.0)|
    +----+----+---------------+
    |null|null|            0.0|
    |   1|null|            1.0|
    |null|   2|            0.0|
    +----+----+---------------+
    """
    # Bridge the Python columns to the JVM and delegate to the Scala
    # functions.coalesce implementation via the active gateway.
    ctx = SparkContext._active_spark_context
    java_cols = _to_seq(ctx, cols, _to_java_column)
    return Column(ctx._jvm.functions.coalesce(java_cols))
205+
206+
170207
def countDistinct(col, *cols):
171208
"""Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.
172209

0 commit comments

Comments
 (0)