From 9c4e2b1e38035f2dae006d426a6a025a215509fe Mon Sep 17 00:00:00 2001 From: Peter Veerman Date: Wed, 2 May 2018 18:04:56 -0700 Subject: [PATCH 1/6] Implement multiple axis for dropna --- python/ray/dataframe/dataframe.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index dd73c0760bbb..803d50e67f16 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -775,10 +775,24 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, If inplace is set to True, returns None, otherwise returns a new DataFrame with the dropna applied. """ + dummy_frame = pd.DataFrame(index=self.index, columns=self.columns) + dummy_frame.dropna(axis=axis, how=how, thresh=thresh, + subset=subset, inplace=inplace) if is_list_like(axis): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + axis = set([dummy_frame._get_axis_number(ax) for ax in axis]) + result = self + for ax in axis: + result = result.dropna( + axis=ax, how=how, thresh=thresh, subset=subset) + if not inplace: + return result + + return self._update_inplace( + row_partitions=result._row_partitions, + col_partitions=result._col_partitions, + columns=result._col_metadata.index, + index=result._row_metadata.index + ) axis = pd.DataFrame()._get_axis_number(axis) inplace = validate_bool_kwarg(inplace, "inplace") From b6403f44f90fedd5bb5479b8a78894cb1fccd84b Mon Sep 17 00:00:00 2001 From: Peter Veerman Date: Thu, 3 May 2018 19:02:47 -0700 Subject: [PATCH 2/6] Add multiple axis dropna test --- python/ray/dataframe/dataframe.py | 3 -- python/ray/dataframe/test/test_dataframe.py | 41 ++++++++++++++++----- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index 803d50e67f16..6c227732f602 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -775,9 +775,6 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, If inplace is set to True, returns None, otherwise returns a new DataFrame with the dropna applied. """ - dummy_frame = pd.DataFrame(index=self.index, columns=self.columns) - dummy_frame.dropna(axis=axis, how=how, thresh=thresh, - subset=subset, inplace=inplace) if is_list_like(axis): axis = set([dummy_frame._get_axis_number(ax) for ax in axis]) result = self diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index fa29762c2b06..00527f05a315 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -839,6 +839,7 @@ def test_dense_nan_df(): test_dropna(ray_df, pd_df) test_dropna_inplace(ray_df, pd_df) + test_dropna_multiple_axes(ray_df, pd_df) @pytest.fixture @@ -1297,16 +1298,17 @@ def test_drop_duplicates(): @pytest.fixture def test_dropna(ray_df, pd_df): - ray_df_equals_pandas(ray_df.dropna(axis=1, how='all'), - pd_df.dropna(axis=1, how='all')) + assert ray_df_equals_pandas(ray_df.dropna(axis=1, how='all'), + pd_df.dropna(axis=1, how='all')) - ray_df_equals_pandas(ray_df.dropna(axis=1, how='any'), - pd_df.dropna(axis=1, how='any')) + assert ray_df_equals_pandas(ray_df.dropna(axis=1, how='any'), + pd_df.dropna(axis=1, how='any')) - ray_df_equals_pandas(ray_df.dropna(axis=0, how='all'), - pd_df.dropna(axis=0, how='all')) + assert ray_df_equals_pandas(ray_df.dropna(axis=0, how='all'), + pd_df.dropna(axis=0, how='all')) - ray_df_equals_pandas(ray_df.dropna(thresh=2), pd_df.dropna(thresh=2)) + assert ray_df_equals_pandas(ray_df.dropna(thresh=2), + pd_df.dropna(thresh=2)) @pytest.fixture @@ -1317,12 +1319,33 @@ def test_dropna_inplace(ray_df, pd_df): ray_df.dropna(thresh=2, inplace=True) pd_df.dropna(thresh=2, inplace=True) - ray_df_equals_pandas(ray_df, pd_df) + assert ray_df_equals_pandas(ray_df, pd_df) ray_df.dropna(axis=1, how='any', inplace=True) pd_df.dropna(axis=1, how='any', inplace=True) - ray_df_equals_pandas(ray_df, pd_df) + assert ray_df_equals_pandas(ray_df, pd_df) + + +@pytest.fixture +def test_dropna_multiple_axes(ray_df, pd_df): + ray_df = ray_df.copy() + pd_df = pd_df.copy() + cp = ray_df.copy() + result = ray_df.dropna(how='all', axis=[0, 1]) + result2 = ray_df.dropna(how='all', axis=(0, 1)) + expected = pd_df.dropna(how='all').dropna(how='all', axis=1) + + assert ray_df_equals_pandas(result, expected) + assert ray_df_equals_pandas(result2, expected) + + assert ray_df_equals_pandas(result, expected) + assert ray_df_equals_pandas(result2, expected) + assert ray_df_equals(ray_df, cp) + + inp = ray_df.copy() + inp.dropna(how='all', axis=(0, 1), inplace=True) + assert ray_df_equals_pandas(inp, expected) def test_duplicated(): From 666860ceebb7a626b0d6f147b6d8831a26345ba1 Mon Sep 17 00:00:00 2001 From: Peter Veerman Date: Thu, 3 May 2018 19:05:01 -0700 Subject: [PATCH 3/6] Fix using dummy_frame in dropna --- python/ray/dataframe/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index 6c227732f602..f5d9929f1c74 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -776,7 +776,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, DataFrame with the dropna applied. """ if is_list_like(axis): - axis = set([dummy_frame._get_axis_number(ax) for ax in axis]) + axis = set([pd.DataFrame()._get_axis_number(ax) for ax in axis]) result = self for ax in axis: result = result.dropna( From 582f62854cf198576089f7bb3a94b8074951f807 Mon Sep 17 00:00:00 2001 From: Peter Veerman Date: Thu, 3 May 2018 19:26:55 -0700 Subject: [PATCH 4/6] Clean up dropna multiple axis tests --- python/ray/dataframe/dataframe.py | 7 ++--- python/ray/dataframe/test/test_dataframe.py | 32 +++++++++++++-------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index f5d9929f1c74..4d0cc8426028 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -776,17 +776,16 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, DataFrame with the dropna applied. """ if is_list_like(axis): - axis = set([pd.DataFrame()._get_axis_number(ax) for ax in axis]) + axis = [pd.DataFrame()._get_axis_number(ax) for ax in axis] result = self - for ax in axis: + for ax in axis: # TODO: inefficient, df built as intermediate result = result.dropna( axis=ax, how=how, thresh=thresh, subset=subset) if not inplace: return result return self._update_inplace( - row_partitions=result._row_partitions, - col_partitions=result._col_partitions, + block_partitions=result._block_partitions, columns=result._col_metadata.index, index=result._row_metadata.index ) diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index 00527f05a315..2d47a07a2104 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -840,6 +840,7 @@ def test_dense_nan_df(): test_dropna(ray_df, pd_df) test_dropna_inplace(ray_df, pd_df) test_dropna_multiple_axes(ray_df, pd_df) + test_dropna_multiple_axes_inplace(ray_df, pd_df) @pytest.fixture @@ -1329,23 +1330,30 @@ def test_dropna_inplace(ray_df, pd_df): @pytest.fixture def test_dropna_multiple_axes(ray_df, pd_df): + assert ray_df_equals_pandas( + ray_df.dropna(how='all', axis=[0, 1]), + pd_df.dropna(how='all', axis=[0, 1]) + ) + assert ray_df_equals_pandas( + ray_df.dropna(how='all', axis=(0, 1)), + pd_df.dropna(how='all', axis=(0, 1)) + ) + + +@pytest.fixture +def test_dropna_multiple_axes_inplace(ray_df, pd_df): ray_df = ray_df.copy() pd_df = pd_df.copy() - cp = ray_df.copy() - result = ray_df.dropna(how='all', axis=[0, 1]) - result2 = ray_df.dropna(how='all', axis=(0, 1)) - expected = pd_df.dropna(how='all').dropna(how='all', axis=1) - assert ray_df_equals_pandas(result, expected) - assert ray_df_equals_pandas(result2, expected) + ray_df.dropna(how='all', axis=[0, 1], inplace=True) + pd_df.dropna(how='all', axis=[0, 1], inplace=True) - assert ray_df_equals_pandas(result, expected) - assert ray_df_equals_pandas(result2, expected) - assert ray_df_equals(ray_df, cp) + assert ray_df_equals_pandas(ray_df, pd_df) - inp = ray_df.copy() - inp.dropna(how='all', axis=(0, 1), inplace=True) - assert ray_df_equals_pandas(inp, expected) + ray_df.dropna(how='all', axis=(0, 1), inplace=True) + pd_df.dropna(how='all', axis=(0, 1), inplace=True) + + assert ray_df_equals_pandas(ray_df, pd_df) def test_duplicated(): From f9e14e7b92478a56c9c41480c310635903de28d4 Mon Sep 17 00:00:00 2001 From: Peter Veerman Date: Thu, 3 May 2018 19:28:03 -0700 Subject: [PATCH 5/6] remove unnecessary axis modification --- python/ray/dataframe/dataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index 4d0cc8426028..6f673ee67a56 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -776,7 +776,6 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, DataFrame with the dropna applied. """ if is_list_like(axis): - axis = [pd.DataFrame()._get_axis_number(ax) for ax in axis] result = self for ax in axis: # TODO: inefficient, df built as intermediate result = result.dropna( From fed275ea45f217e60dfac4b9f5b268c91559d470 Mon Sep 17 00:00:00 2001 From: Peter Veerman Date: Thu, 3 May 2018 19:38:15 -0700 Subject: [PATCH 6/6] Clean up dropna tests --- python/ray/dataframe/dataframe.py | 8 +++++--- python/ray/dataframe/test/test_dataframe.py | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index 6f673ee67a56..61cf4a6d8e34 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -777,7 +777,9 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, """ if is_list_like(axis): result = self - for ax in axis: # TODO: inefficient, df built as intermediate + # TODO(kunalgosar): this builds an intermediate dataframe, + # which does unnecessary computation + for ax in axis: result = result.dropna( axis=ax, how=how, thresh=thresh, subset=subset) if not inplace: @@ -785,8 +787,8 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, return self._update_inplace( block_partitions=result._block_partitions, - columns=result._col_metadata.index, - index=result._row_metadata.index + columns=result.columns, + index=result.index ) axis = pd.DataFrame()._get_axis_number(axis) diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index 2d47a07a2104..51698e392b35 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -1342,18 +1342,21 @@ def test_dropna_multiple_axes(ray_df, pd_df): @pytest.fixture def test_dropna_multiple_axes_inplace(ray_df, pd_df): - ray_df = ray_df.copy() - pd_df = pd_df.copy() + ray_df_copy = ray_df.copy() + pd_df_copy = pd_df.copy() - ray_df.dropna(how='all', axis=[0, 1], inplace=True) - pd_df.dropna(how='all', axis=[0, 1], inplace=True) + ray_df_copy.dropna(how='all', axis=[0, 1], inplace=True) + pd_df_copy.dropna(how='all', axis=[0, 1], inplace=True) - assert ray_df_equals_pandas(ray_df, pd_df) + assert ray_df_equals_pandas(ray_df_copy, pd_df_copy) - ray_df.dropna(how='all', axis=(0, 1), inplace=True) - pd_df.dropna(how='all', axis=(0, 1), inplace=True) + ray_df_copy = ray_df.copy() + pd_df_copy = pd_df.copy() - assert ray_df_equals_pandas(ray_df, pd_df) + ray_df_copy.dropna(how='all', axis=(0, 1), inplace=True) + pd_df_copy.dropna(how='all', axis=(0, 1), inplace=True) + + assert ray_df_equals_pandas(ray_df_copy, pd_df_copy) def test_duplicated():