diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py
index 561111677ee8..e00cb09732cd 100644
--- a/python/ray/dataframe/dataframe.py
+++ b/python/ray/dataframe/dataframe.py
@@ -806,17 +806,20 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
         if how is None and thresh is None:
             raise TypeError('must specify how or thresh')
 
+        indices = None
         if subset is not None:
-            subset = set(subset)
-
-            if axis == 1:
-                subset = [item for item in self.index if item in subset]
-            else:
-                subset = [item for item in self.columns if item in subset]
+            # subset labels live on the axis that is NOT being dropped:
+            # row labels when dropping columns (axis == 1), else columns.
+            labels = self.index if axis == 1 else self.columns
+            # get_indexer_for reports labels it cannot find as -1.
+            indices = labels.get_indexer_for(subset)
+            missing = indices == -1
+            if missing.any():
+                raise KeyError(list(np.compress(missing, subset)))
 
         def dropna_helper(df):
             new_df = df.dropna(axis=axis, how=how, thresh=thresh,
-                               subset=subset, inplace=False)
+                               subset=indices, inplace=False)
 
             if axis == 1:
                 new_index = new_df.columns
diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py
index 1fa63465d87a..88b1ec1c080c 100644
--- a/python/ray/dataframe/test/test_dataframe.py
+++ b/python/ray/dataframe/test/test_dataframe.py
@@ -844,8 +844,13 @@ def test_dense_nan_df():
 
+    column_subsets = [list('AD'), list('BC'), list('CD')]
+    row_subsets = [[0, 1], [0, 1, 2], [2, 0]]
+
     test_dropna(ray_df, pd_df)
     test_dropna_inplace(ray_df, pd_df)
     test_dropna_multiple_axes(ray_df, pd_df)
     test_dropna_multiple_axes_inplace(ray_df, pd_df)
+    test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets)
+    test_dropna_subset_error(ray_df)
 
 
 @pytest.fixture
@@ -1402,6 +1407,38 @@ def test_dropna_multiple_axes_inplace(ray_df, pd_df):
     assert ray_df_equals_pandas(ray_df_copy, pd_df_copy)
 
 
+@pytest.fixture
+def test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets):
+    """Compare dropna(subset=...) against pandas along both axes.
+
+    column_subsets: lists of column labels, used when dropping rows.
+    row_subsets: lists of row labels, used when dropping columns (axis=1).
+    """
+    for subset in column_subsets:
+        for how in ('all', 'any'):
+            assert ray_df_equals_pandas(
+                ray_df.dropna(how=how, subset=subset),
+                pd_df.dropna(how=how, subset=subset)
+            )
+
+    for subset in row_subsets:
+        for how in ('all', 'any'):
+            assert ray_df_equals_pandas(
+                ray_df.dropna(how=how, axis=1, subset=subset),
+                pd_df.dropna(how=how, axis=1, subset=subset)
+            )
+
+
+@pytest.fixture
+def test_dropna_subset_error(ray_df):
+    """Check that dropna raises KeyError for unknown subset labels."""
+    with pytest.raises(KeyError):
+        ray_df.dropna(subset=list('EF'))
+
+    with pytest.raises(KeyError):
+        ray_df.dropna(axis=1, subset=[4, 5])
+
+
 def test_duplicated():
     ray_df = create_test_dataframe()
 