SPARK-7118 Make the coalesce Spark SQL function available in PySpark

Olivier Girardot · Olivier Girardot · commit e3fec1e76eaa · 2015-04-24T10:47:57.000+02:00
No changes to the Scala/Java part, only changes in Python.
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
@@ -30,7 +30,7 @@
 from pyspark.sql.dataframe import Column, _to_java_column, _to_seq
 
 
-__all__ = ['countDistinct', 'approxCountDistinct', 'udf']
+__all__ = ['countDistinct', 'approxCountDistinct', 'udf', 'coalesce']
 
 
 def _create_function(name, doc=""):
@@ -75,6 +75,26 @@ def _(col):
 __all__.sort()
 
 
+def coalesce(*cols):
+    """Returns a :class:`Column` holding, for each row, the first non-null value among ``cols``.
+
+    >>> df.select(coalesce(df["a"], df["b"])).show()
+    Coalesce(a,b)
+    1
+    3
+    5
+
+    >>> df.select('*', coalesce(df["a"], lit(0.0))).show()
+    a    b    Coalesce(a,0.0)
+    1    2    1.0
+    null 3    0.0
+    5    null 5.0
+    """
+    sc = SparkContext._active_spark_context  # active context; provides the _jvm handle used below
+    jc = sc._jvm.functions.coalesce(_to_seq(sc, cols, _to_java_column))
+    return Column(jc)
+
+
 def countDistinct(col, *cols):
     """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.