27 changes: 19 additions & 8 deletions python/pyspark/sql/dataframe.py
@@ -1362,8 +1362,8 @@ def replace(self, to_replace, value=None, subset=None):
"""Returns a new :class:`DataFrame` replacing a value with another value.
:func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are
aliases of each other.
Values to_replace and value should contain either all numerics, all booleans,
or all strings. When replacing, the new value will be cast
Values to_replace and value must have the same type and can only be numerics, booleans,
or strings. Value can have None. When replacing, the new value will be cast
to the type of the existing column.
For numeric replacements all values to be replaced should have unique
floating point representation. In case of conflicts (for example with `{42: -1, 42.0: 1}`)
@@ -1373,8 +1373,8 @@ def replace(self, to_replace, value=None, subset=None):
Value to be replaced.
If the value is a dict, then `value` is ignored and `to_replace` must be a
mapping between a value and a replacement.
:param value: int, long, float, string, or list.
The replacement value must be an int, long, float, or string. If `value` is a
:param value: int, long, float, string, list or None.
Review comment (Member): It's not related to this PR, but we should add bool to this type list here and in the other descriptions?

The replacement value must be an int, long, float, string or None. If `value` is a
list, `value` should be of the same length and type as `to_replace`.
If `value` is a scalar and `to_replace` is a sequence, then `value` is
used as a replacement for each item in `to_replace`.
Expand All @@ -1393,6 +1393,16 @@ def replace(self, to_replace, value=None, subset=None):
|null| null| null|
+----+------+-----+

>>> df4.na.replace('Alice', None).show()
Review comment (Member): Looks like we now allow something like df4.na.replace('Alice').show(). We'd better add it here.

Review comment (Member): Actually, I think this is something to be fixed in DataFrameNaFunctions.replace in this file...

Review comment (Member): ...and I was thinking of not doing this here, since strictly it should be a follow-up for SPARK-19454. I am fine with doing it here too while we are at it.

Review comment (Member): This change allows us to do df4.na.replace('Alice'). I think SPARK-19454 doesn't?

Review comment (HyukjinKwon, Aug 7, 2017): I guess it is .na.replace vs .replace. I think both should be the same, though. I just built against this PR and double-checked as below:

>>> df = spark.createDataFrame([('Alice', 10, 80.0)])
>>> df.replace("Alice").first()
Row(_1=None, _2=10, _3=80.0)
>>> df.na.replace("Alice").first()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: replace() takes at least 3 arguments (2 given)

Review comment (Member): I hadn't noticed that. Why do we test dataframe.na.replace in the doctest of dataframe.replace? We should test dataframe.replace here.

Review comment (HyukjinKwon, Aug 7, 2017): I assume we added an alias for dataframe.replace to promote use of dataframe.na.replace? The doc says they are aliases anyway. I don't know, but I tend to agree with pairing doctests, and this looks renamed in ff26767.

Let's leave this as is for now. I don't want to make this PR complicated.

Review comment (Member): OK. I'm fine with this.

Review comment (Member): I filed a JIRA for the mismatching default value between replace and na.replace: SPARK-21658.

+----+------+----+
| age|height|name|
+----+------+----+
| 10| 80|null|
| 5| null| Bob|
|null| null| Tom|
|null| null|null|
+----+------+----+

>>> df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()
+----+------+----+
| age|height|name|
@@ -1428,9 +1438,9 @@ def all_of_(xs):
"to_replace should be a float, int, long, string, list, tuple, or dict. "
"Got {0}".format(type(to_replace)))

if not isinstance(value, valid_types) and not isinstance(to_replace, dict):
if not isinstance(value, valid_types + (type(None), )) and not isinstance(to_replace, dict):
Review comment (Member): I would check for None by value is None.

raise ValueError("If to_replace is not a dict, value should be "
"a float, int, long, string, list, or tuple. "
"a float, int, long, string, list, tuple or None. "
"Got {0}".format(type(value)))

if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)):
@@ -1446,7 +1456,7 @@ def all_of_(xs):
if isinstance(to_replace, (float, int, long, basestring)):
Review comment (Member): bool?

to_replace = [to_replace]

if isinstance(value, (float, int, long, basestring)):
if isinstance(value, (float, int, long, basestring, type(None))):
value = [value for _ in range(len(to_replace))]
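Together, these two isinstance branches normalize scalar arguments: a lone to_replace becomes a one-element list, and a scalar value (now including None) is broadcast across it. A self-contained sketch of that normalization (the function name is illustrative, not the real pyspark internals):

```python
def normalize_args(to_replace, value=None):
    # A scalar to_replace becomes a one-element list.
    if isinstance(to_replace, (float, int, str)):
        to_replace = [to_replace]
    # A scalar value, including the default None, is repeated once per target.
    if isinstance(value, (float, int, str, type(None))):
        value = [value for _ in range(len(to_replace))]
    return to_replace, value

print(normalize_args(['Alice', 'Bob']))  # (['Alice', 'Bob'], [None, None])
print(normalize_args(10, 20))            # ([10], [20])
```

This is also why calling replace with only to_replace now works: the omitted value defaults to None and is broadcast like any other scalar.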

if isinstance(to_replace, dict):
@@ -1460,7 +1470,8 @@ def all_of_(xs):
subset = [subset]

# Verify we were not passed in mixed type generics."
Review comment (HyukjinKwon, Aug 7, 2017): While we are here, let's remove this " at the end, which looks like a typo.

if not any(all_of_type(rep_dict.keys()) and all_of_type(rep_dict.values())
if not any(all_of_type(rep_dict.keys())
and all_of_type(x for x in rep_dict.values() if x is not None)
for all_of_type in [all_of_bool, all_of_str, all_of_numeric]):
raise ValueError("Mixed type replacements are not supported")
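The any(...) guard above accepts a replacement dict only when all keys, and all non-None values, fall into a single type family. A standalone sketch of the same idea (the helper names are illustrative):

```python
def all_of(types):
    def inner(xs):
        return all(isinstance(x, types) for x in xs)
    return inner

all_of_bool = all_of(bool)
all_of_str = all_of(str)
all_of_numeric = all_of((float, int))  # note: bools also pass, since bool subclasses int

def is_consistent(rep_dict):
    # None values are filtered out before the type check, so {'Alice': None} is fine.
    return any(
        all_of_type(rep_dict.keys())
        and all_of_type(x for x in rep_dict.values() if x is not None)
        for all_of_type in [all_of_bool, all_of_str, all_of_numeric]
    )

print(is_consistent({'Alice': None, 'Bob': 'B'}))  # True: all-string, None ignored
print(is_consistent({10: 'A'}))                    # False: mixed numeric/string
```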

10 changes: 10 additions & 0 deletions python/pyspark/sql/tests.py
@@ -1964,6 +1964,16 @@ def test_replace(self):
.replace(False, True).first())
self.assertTupleEqual(row, (True, True))

# replace with None
row = self.spark.createDataFrame(
[(u'Alice', 10, 80.0)], schema).replace(u'Alice', None).first()
self.assertTupleEqual(row, (None, 10, 80.0))

# replace with numerics and None
row = self.spark.createDataFrame(
[(u'Alice', 10, 80.0)], schema).replace([10, 80], [20, None]).first()
self.assertTupleEqual(row, (u'Alice', 20, None))
Review comment (Member): Can you add a test where to_replace is a list and value is not given (so the default value None is used)? Previously this would raise a ValueError, but now it is valid usage. We'd better add a test explicitly for it.


# should fail if subset is not list, tuple or None
with self.assertRaises(ValueError):
self.spark.createDataFrame(
@@ -145,8 +145,8 @@ class DataTypeSuite extends SparkFunSuite {
val message = intercept[SparkException] {
left.merge(right)
}.getMessage
assert(message.equals("Failed to merge fields 'b' and 'b'. " +
"Failed to merge incompatible data types FloatType and LongType"))
assert(message === "Failed to merge fields 'b' and 'b'. " +
Review comment (Member): Nit: not related to this PR. Please revert it back.

Review comment (Contributor, author): Is this change a valid improvement? I forgot about === when I pushed that commit. I can revert this, but do I need to create another PR? With or without a JIRA?

Review comment (HyukjinKwon, Aug 8, 2017): It does look like a valid improvement, but it makes backporting harder sometimes. Let's revert this one if that's fine. We could mention fixing it when someone (or you) happens to touch the code around here. I guess it is too trivial for its own PR.

"Failed to merge incompatible data types FloatType and LongType")
}

test("existsRecursively") {
@@ -262,6 +262,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
* Replaces values matching keys in `replacement` map with the corresponding values.
* Key and value of `replacement` map must have the same type, and
* can only be doubles, strings or booleans.
* `replacement` map value can have null.
* If `col` is "*", then the replacement is applied on all string columns or numeric columns.
*
* {{{
@@ -290,6 +291,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
* Replaces values matching keys in `replacement` map with the corresponding values.
* Key and value of `replacement` map must have the same type, and
* can only be doubles, strings or booleans.
* `replacement` map value can have null.
*
* {{{
* import com.google.common.collect.ImmutableMap;
@@ -314,6 +316,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
* (Scala-specific) Replaces values matching keys in `replacement` map.
* Key and value of `replacement` map must have the same type, and
* can only be doubles, strings or booleans.
* `replacement` map value can have null.
Review comment (Member): Do not put it here. It should be put in @param.

Review comment (Contributor, author): The original comments put many @param descriptions up here. I will fix those as well.

* If `col` is "*",
* then the replacement is applied on all string columns , numeric columns or boolean columns.
*
@@ -344,7 +347,8 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
/**
* (Scala-specific) Replaces values matching keys in `replacement` map.
* Key and value of `replacement` map must have the same type, and
* can only be doubles , strings or booleans.
* can only be doubles, strings or booleans.
* `replacement` map value can have null.
*
* {{{
* // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight".
@@ -366,11 +370,15 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) {
return df
}

// replacementMap is either Map[String, String] or Map[Double, Double] or Map[Boolean,Boolean]
val replacementMap: Map[_, _] = replacement.head._2 match {
case v: String => replacement
case v: Boolean => replacement
case _ => replacement.map { case (k, v) => (convertToDouble(k), convertToDouble(v)) }
// replacementMap is either Map[String, String], Map[Double, Double], Map[Boolean,Boolean]
// while value can have null
Review comment (Member): If the types are not these three types, what is the behavior? Could you explain it here? Also, please add negative examples too. Thanks~

Review comment (Contributor, author): If replacement is of type Map[Any, Any], the replacementMap will not be confined to these three types. We tell users to only use doubles, strings, and booleans in the replacement map in the method doc, but a user can still write df.na.replace("*", Map(10 -> 20, "Alpha" -> "Bravo")). The result is that only fields with the same type as the first key in the replacement map are replaced. This is due to the implementation of targetColumnType a few lines below. I'll modify the comments here. But for the negative examples (like the one I mentioned in this comment), do I need to explain them to users in the method doc?

val replacementMap: Map[_, _] = replacement.map {
case (k, v: String) => (k, v)
case (k, v: Boolean) => (k, v)
case (k: String, null) => (k, null)
case (k: Boolean, null) => (k, null)
case (k, null) => (convertToDouble(k), null)
case _ @(k, v) => (convertToDouble(k), convertToDouble(v))
Review comment (Member): Could we use case (k, v) => instead of case _ @(k, v) =>?

}

// targetColumnType is either DoubleType or StringType or BooleanType
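The Scala match above keeps string and boolean pairs as-is, passes null values through, and coerces everything else to Double via convertToDouble. A hedged Python mirror of that dispatch, for illustration only (not Spark's actual code):

```python
def normalize_map(replacement):
    """Mirror of the Scala match on replacement pairs."""
    out = {}
    for k, v in replacement.items():
        if isinstance(v, (str, bool)):
            out[k] = v                # string/boolean values: keep the pair as-is
        elif v is None and isinstance(k, (str, bool)):
            out[k] = None             # null value with a string/boolean key
        elif v is None:
            out[float(k)] = None      # null value with a numeric key: coerce the key
        else:
            out[float(k)] = float(v)  # numeric pair: coerce both to double
    return out

print(normalize_map({16: 61, 60: None}))  # {16.0: 61.0, 60.0: None}
```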
@@ -248,16 +248,16 @@ class DataFrameNaFunctionsSuite extends QueryTest with SharedSQLContext {
assert(out(4) === Row("Amy", null, null))
assert(out(5) === Row(null, null, null))

// Replace only the age column
val out1 = input.na.replace("age", Map(
// Replace only the age column and with null
val out1 = input.na.replace("age", Map[Any, Any](
Review comment (Member): How about adding a separate test instead, and leaving the existing test as is?

16 -> 61,
60 -> 6,
60 -> null,
164.3 -> 461.3 // Alice is really tall
)).collect()

assert(out1(0) === Row("Bob", 61, 176.5))
assert(out1(1) === Row("Alice", null, 164.3))
assert(out1(2) === Row("David", 6, null))
assert(out1(2) === Row("David", null, null))
assert(out1(3).get(2).asInstanceOf[Double].isNaN)
assert(out1(4) === Row("Amy", null, null))
assert(out1(5) === Row(null, null, null))