Implement DataFrame.where() & DataFrame.mask() (#1018)

itholic · HyukjinKwon · commit 7070bc6e29dd · 2019-11-22T10:44:59.000+09:00
Resolves #884.

This PR implements `where` for `DataFrame` (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.where.html#pandas.DataFrame.where) and `mask` for `DataFrame` (the same as `where`, except that the condition is inverted) (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mask.html#pandas.DataFrame.mask).

```python
>>> df1 = ks.DataFrame({'A': [0, 1, 2, 3, 4], 'B':[100, 200, 300, 400, 500]})
>>> df2 = ks.DataFrame({'A': [0, -1, -2, -3, -4], 'B':[-100, -200, -300, -400, -500]})
>>> df1
   A    B
0  0  100
1  1  200
2  2  300
3  3  400
4  4  500
>>> df2
   A    B
0  0 -100
1 -1 -200
2 -2 -300
3 -3 -400
4 -4 -500

>>> df1.where(df1 > 0).sort_index()
     A      B
0  NaN  100.0
1  1.0  200.0
2  2.0  300.0
3  3.0  400.0
4  4.0  500.0

>>> df1.where(df1 > 1, 10).sort_index()
    A    B
0  10  100
1  10  200
2   2  300
3   3  400
4   4  500

>>> df1.where(df1 > 1, df1 + 100).sort_index()
     A    B
0  100  100
1  101  200
2    2  300
3    3  400
4    4  500

>>> df1.where(df1 > 1, df2).sort_index()
   A    B
0  0  100
1 -1  200
2  2  300
3  3  400
4  4  500
```
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -2042,6 +2042,249 @@ class DataFrame(_Frame, Generic[T]):
 
         return result
 
+    def where(self, cond, other=np.nan):
+        """
+        Replace values where the condition is False.
+
+        Parameters
+        ----------
+        cond : boolean DataFrame or Series
+            Where cond is True, keep the original value. Where False,
+            replace with corresponding value from other.
+        other : scalar, DataFrame
+            Entries where cond is False are replaced with corresponding value from other.
+            When other is a DataFrame, a column of self that is missing from other
+            uses NaN as its replacement value.
+
+        Returns
+        -------
+        DataFrame
+
+        Raises
+        ------
+        ValueError
+            If cond is neither a DataFrame nor a Series.
+
+        Examples
+        --------
+
+        >>> from databricks.koalas.config import set_option, reset_option
+        >>> set_option("compute.ops_on_diff_frames", True)
+        >>> df1 = ks.DataFrame({'A': [0, 1, 2, 3, 4], 'B':[100, 200, 300, 400, 500]})
+        >>> df2 = ks.DataFrame({'A': [0, -1, -2, -3, -4], 'B':[-100, -200, -300, -400, -500]})
+        >>> df1
+           A    B
+        0  0  100
+        1  1  200
+        2  2  300
+        3  3  400
+        4  4  500
+        >>> df2
+           A    B
+        0  0 -100
+        1 -1 -200
+        2 -2 -300
+        3 -3 -400
+        4 -4 -500
+
+        >>> df1.where(df1 > 0).sort_index()
+             A      B
+        0  NaN  100.0
+        1  1.0  200.0
+        2  2.0  300.0
+        3  3.0  400.0
+        4  4.0  500.0
+
+        >>> df1.where(df1 > 1, 10).sort_index()
+            A    B
+        0  10  100
+        1  10  200
+        2   2  300
+        3   3  400
+        4   4  500
+
+        >>> df1.where(df1 > 1, df1 + 100).sort_index()
+             A    B
+        0  100  100
+        1  101  200
+        2    2  300
+        3    3  400
+        4    4  500
+
+        >>> df1.where(df1 > 1, df2).sort_index()
+           A    B
+        0  0  100
+        1 -1  200
+        2  2  300
+        3  3  400
+        4  4  500
+
+        When a column of self has no column of the same name in cond, the condition
+        is treated as False for every value of that column
+
+        >>> cond = ks.DataFrame({'C': [0, -1, -2, -3, -4], 'D':[4, 3, 2, 1, 0]}) % 3 == 0
+        >>> cond
+               C      D
+        0   True  False
+        1  False   True
+        2  False  False
+        3   True  False
+        4  False   True
+
+        >>> df1.where(cond).sort_index()
+            A   B
+        0 NaN NaN
+        1 NaN NaN
+        2 NaN NaN
+        3 NaN NaN
+        4 NaN NaN
+
+        When cond is a Series, the same condition is applied to every column,
+        regardless of the column names
+
+        >>> cond = ks.Series([1, 2]) > 1
+        >>> cond
+        0    False
+        1     True
+        Name: 0, dtype: bool
+
+        >>> df1.where(cond).sort_index()
+             A      B
+        0  NaN    NaN
+        1  1.0  200.0
+        2  NaN    NaN
+        3  NaN    NaN
+        4  NaN    NaN
+
+        >>> reset_option("compute.ops_on_diff_frames")
+        """
+        from databricks.koalas.series import Series
+        # Templates for the helper columns attached to a copy of self: one condition
+        # column and one replacement column per data column.
+        tmp_cond_col_name = '__tmp_cond_col_{}__'
+        tmp_other_col_name = '__tmp_other_col_{}__'
+        # Work on a copy so the helper columns are never added to self.
+        kdf = self.copy()
+        if isinstance(cond, DataFrame):
+            for column in self._internal.data_columns:
+                # A column missing from cond falls back to False, i.e. always replaced.
+                kdf[tmp_cond_col_name.format(column)] = cond.get(column, False)
+        elif isinstance(cond, Series):
+            for column in self._internal.data_columns:
+                # The same Series is used as the condition for every column.
+                kdf[tmp_cond_col_name.format(column)] = cond
+        else:
+            raise ValueError("type of cond must be a DataFrame or Series")
+
+        if isinstance(other, DataFrame):
+            for column in self._internal.data_columns:
+                # A column missing from other falls back to NaN as the replacement.
+                kdf[tmp_other_col_name.format(column)] = other.get(column, np.nan)
+        else:
+            for column in self._internal.data_columns:
+                # Any non-DataFrame other is assigned as-is to every replacement column.
+                kdf[tmp_other_col_name.format(column)] = other
+
+        sdf = kdf._sdf
+        # The logic above makes the Spark DataFrame look like the table below.
+        # NOTE(review): the table is illustrative only; the statements above add all
+        # condition columns before the replacement columns, so the real column order
+        # presumably differs from the interleaved order shown here.
+        # +-----------------+---+---+------------------+-------------------+------------------+--...
+        # |__index_level_0__|  A|  B|__tmp_cond_col_A__|__tmp_other_col_A__|__tmp_cond_col_B__|__...
+        # +-----------------+---+---+------------------+-------------------+------------------+--...
+        # |                0|  0|100|              true|                  0|             false|  ...
+        # |                1|  1|200|             false|                 -1|             false|  ...
+        # |                3|  3|400|              true|                 -3|             false|  ...
+        # |                2|  2|300|             false|                 -2|              true|  ...
+        # |                4|  4|500|             false|                 -4|             false|  ...
+        # +-----------------+---+---+------------------+-------------------+------------------+--...
+
+        # For each data column, keep the original value where its condition column is
+        # true and take the replacement column otherwise; the result keeps the
+        # original column name.
+        output = []
+        for column in self._internal.data_columns:
+            data_col_name = self._internal.column_name_for(column)
+            output.append(
+                F.when(
+                    sdf[tmp_cond_col_name.format(column)], sdf[data_col_name]
+                ).otherwise(
+                    sdf[tmp_other_col_name.format(column)]
+                ).alias(data_col_name))
+
+        # Select only the index and the computed columns, dropping the helper columns.
+        index_columns = self._internal.index_columns
+        sdf = sdf.select(*index_columns, *output)
+
+        return DataFrame(self._internal.copy(
+            sdf=sdf,
+            column_scols=[scol_for(sdf, column) for column in self._internal.data_columns]))
+
+    def mask(self, cond, other=np.nan):
+        """
+        Replace values where the condition is True.
+
+        Parameters
+        ----------
+        cond : boolean DataFrame or Series
+            Where cond is False, keep the original value. Where True,
+            replace with corresponding value from other.
+        other : scalar, DataFrame
+            Entries where cond is True are replaced with corresponding value from other.
+
+        Returns
+        -------
+        DataFrame
+
+        Raises
+        ------
+        ValueError
+            If cond is neither a DataFrame nor a Series.
+
+        Examples
+        --------
+
+        >>> from databricks.koalas.config import set_option, reset_option
+        >>> set_option("compute.ops_on_diff_frames", True)
+        >>> df1 = ks.DataFrame({'A': [0, 1, 2, 3, 4], 'B':[100, 200, 300, 400, 500]})
+        >>> df2 = ks.DataFrame({'A': [0, -1, -2, -3, -4], 'B':[-100, -200, -300, -400, -500]})
+        >>> df1
+           A    B
+        0  0  100
+        1  1  200
+        2  2  300
+        3  3  400
+        4  4  500
+        >>> df2
+           A    B
+        0  0 -100
+        1 -1 -200
+        2 -2 -300
+        3 -3 -400
+        4 -4 -500
+
+        >>> df1.mask(df1 > 0).sort_index()
+             A   B
+        0  0.0 NaN
+        1  NaN NaN
+        2  NaN NaN
+        3  NaN NaN
+        4  NaN NaN
+
+        >>> df1.mask(df1 > 1, 10).sort_index()
+            A   B
+        0   0  10
+        1   1  10
+        2  10  10
+        3  10  10
+        4  10  10
+
+        >>> df1.mask(df1 > 1, df1 + 100).sort_index()
+             A    B
+        0    0  200
+        1    1  300
+        2  102  400
+        3  103  500
+        4  104  600
+
+        >>> df1.mask(df1 > 1, df2).sort_index()
+           A    B
+        0  0 -100
+        1  1 -200
+        2 -2 -300
+        3 -3 -400
+        4 -4 -500
+
+        >>> reset_option("compute.ops_on_diff_frames")
+        """
+        from databricks.koalas.series import Series
+        if not isinstance(cond, (DataFrame, Series)):
+            raise ValueError("type of cond must be a DataFrame or Series")
+
+        if isinstance(cond, Series):
+            # A Series condition cannot be rebuilt as a frame with self's column names,
+            # so negate it directly and let where() apply it to every column.
+            return self.where(~cond, other)
+
+        # Negate every boolean column of the condition frame in place.
+        sdf = cond._internal.sdf
+        for col in cond._internal.data_columns:
+            sdf = sdf.withColumn(col, ~F.col(col))
+
+        # Rebuild the inverted condition from cond's own metadata and columns.
+        # Using self's metadata here would fail whenever cond's columns differ from
+        # self's; where() already handles columns missing from the condition.
+        internal = cond._internal.copy(
+            sdf=sdf,
+            column_scols=[scol_for(sdf, column) for column in cond._internal.data_columns])
+        cond_inversed = DataFrame(internal)
+
+        return self.where(cond_inversed, other)
+
     @property
     def index(self):
         """The index (row labels) Column of the DataFrame.
diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
@@ -69,7 +69,6 @@ class _MissingPandasLikeDataFrame(object):
     last_valid_index = unsupported_function('last_valid_index')
     lookup = unsupported_function('lookup')
     mad = unsupported_function('mad')
-    mask = unsupported_function('mask')
     mode = unsupported_function('mode')
     pct_change = unsupported_function('pct_change')
     prod = unsupported_function('prod')
@@ -100,7 +99,6 @@ class _MissingPandasLikeDataFrame(object):
     tz_convert = unsupported_function('tz_convert')
     tz_localize = unsupported_function('tz_localize')
     unstack = unsupported_function('unstack')
-    where = unsupported_function('where')
 
     # Deprecated functions
     as_blocks = unsupported_function('as_blocks', deprecated=True)
diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -2227,3 +2227,15 @@ def test_quantile(self):
 
         with self.assertRaisesRegex(ValueError, "quantile currently doesn't supports numeric_only"):
             kdf.quantile(.5, numeric_only=False)
+
+    def test_where(self):
+        # A condition that is neither a DataFrame nor a Series must be rejected.
+        invalid_cond = 1
+        frame = ks.from_pandas(self.pdf)
+
+        with self.assertRaisesRegex(ValueError, 'type of cond must be a DataFrame or Series'):
+            frame.where(invalid_cond)
+
+    def test_mask(self):
+        # A condition that is neither a DataFrame nor a Series must be rejected.
+        invalid_cond = 1
+        frame = ks.from_pandas(self.pdf)
+
+        with self.assertRaisesRegex(ValueError, 'type of cond must be a DataFrame or Series'):
+            frame.mask(invalid_cond)
diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py
@@ -422,6 +422,40 @@ def test_loc_setitem(self):
 
         self.assert_eq(kdf.sort_index(), pdf.sort_index())
 
+    def test_where(self):
+        # Case 1: the condition comes from a second frame and is False everywhere.
+        left_pdf = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': [100, 200, 300, 400, 500]})
+        right_pdf = pd.DataFrame({'A': [0, -1, -2, -3, -4], 'B': [-100, -200, -300, -400, -500]})
+        left_kdf = ks.from_pandas(left_pdf)
+        right_kdf = ks.from_pandas(right_pdf)
+
+        expected = repr(left_pdf.where(right_pdf > 100))
+        actual = repr(left_kdf.where(right_kdf > 100).sort_index())
+        self.assert_eq(expected, actual)
+
+        # Case 2: all-negative data with a different threshold.
+        left_pdf = pd.DataFrame({'A': [-1, -2, -3, -4, -5], 'B': [-100, -200, -300, -400, -500]})
+        right_pdf = pd.DataFrame({'A': [-10, -20, -30, -40, -50], 'B': [-5, -4, -3, -2, -1]})
+        left_kdf = ks.from_pandas(left_pdf)
+        right_kdf = ks.from_pandas(right_pdf)
+
+        expected = repr(left_pdf.where(right_pdf < -250))
+        actual = repr(left_kdf.where(right_kdf < -250).sort_index())
+        self.assert_eq(expected, actual)
+
+    def test_mask(self):
+        # Case 1: the condition comes from a second frame and is True everywhere.
+        left_pdf = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': [100, 200, 300, 400, 500]})
+        right_pdf = pd.DataFrame({'A': [0, -1, -2, -3, -4], 'B': [-100, -200, -300, -400, -500]})
+        left_kdf = ks.from_pandas(left_pdf)
+        right_kdf = ks.from_pandas(right_pdf)
+
+        expected = repr(left_pdf.mask(right_pdf < 100))
+        actual = repr(left_kdf.mask(right_kdf < 100).sort_index())
+        self.assert_eq(expected, actual)
+
+        # Case 2: all-negative data with a different threshold.
+        left_pdf = pd.DataFrame({'A': [-1, -2, -3, -4, -5], 'B': [-100, -200, -300, -400, -500]})
+        right_pdf = pd.DataFrame({'A': [-10, -20, -30, -40, -50], 'B': [-5, -4, -3, -2, -1]})
+        left_kdf = ks.from_pandas(left_pdf)
+        right_kdf = ks.from_pandas(right_pdf)
+
+        expected = repr(left_pdf.mask(right_pdf > -250))
+        actual = repr(left_kdf.mask(right_kdf > -250).sort_index())
+        self.assert_eq(expected, actual)
+
     def test_multi_index_column_assignment_frame(self):
         pdf = pd.DataFrame({'a': [1, 2, 3, 2], 'b': [4.0, 2.0, 3.0, 1.0]})
         pdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y')])
@@ -493,3 +527,41 @@ def test_loc_setitem(self):
 
         with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
             kdf.loc[['viper', 'sidewinder'], ['shield']] = another_kdf.max_speed
+
+    def test_where(self):
+        """`where` with a condition built from another DataFrame must raise ValueError.
+
+        NOTE(review): presumably this test class runs with
+        "compute.ops_on_diff_frames" disabled -- confirm against the class setup.
+        """
+        pdf1 = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': [100, 200, 300, 400, 500]})
+        pdf2 = pd.DataFrame({'A': [0, -1, -2, -3, -4], 'B': [-100, -200, -300, -400, -500]})
+        kdf1 = ks.from_pandas(pdf1)
+        kdf2 = ks.from_pandas(pdf2)
+
+        # Only the Koalas call belongs in the context manager: a comparison against
+        # pandas placed here would never run once the expected error is raised.
+        with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
+            kdf1.where(kdf2 > 100)
+
+        pdf1 = pd.DataFrame({'A': [-1, -2, -3, -4, -5], 'B': [-100, -200, -300, -400, -500]})
+        pdf2 = pd.DataFrame({'A': [-10, -20, -30, -40, -50], 'B': [-5, -4, -3, -2, -1]})
+        kdf1 = ks.from_pandas(pdf1)
+        kdf2 = ks.from_pandas(pdf2)
+
+        with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
+            kdf1.where(kdf2 < -250)
+
+    def test_mask(self):
+        """`mask` with a condition built from another DataFrame must raise ValueError.
+
+        NOTE(review): presumably this test class runs with
+        "compute.ops_on_diff_frames" disabled -- confirm against the class setup.
+        """
+        pdf1 = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': [100, 200, 300, 400, 500]})
+        pdf2 = pd.DataFrame({'A': [0, -1, -2, -3, -4], 'B': [-100, -200, -300, -400, -500]})
+        kdf1 = ks.from_pandas(pdf1)
+        kdf2 = ks.from_pandas(pdf2)
+
+        # Only the Koalas call belongs in the context manager: a comparison against
+        # pandas placed here would never run once the expected error is raised.
+        with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
+            kdf1.mask(kdf2 < 100)
+
+        pdf1 = pd.DataFrame({'A': [-1, -2, -3, -4, -5], 'B': [-100, -200, -300, -400, -500]})
+        pdf2 = pd.DataFrame({'A': [-10, -20, -30, -40, -50], 'B': [-5, -4, -3, -2, -1]})
+        kdf1 = ks.from_pandas(pdf1)
+        kdf2 = ks.from_pandas(pdf2)
+
+        with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
+            kdf1.mask(kdf2 > -250)
diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
@@ -58,6 +58,8 @@ Indexing, iteration
    DataFrame.keys
    DataFrame.xs
    DataFrame.get
+   DataFrame.where
+   DataFrame.mask
 
 Binary operator functions
 -------------------------