From 3bd42e3f6b51c69e7c73532630ed1369b57c82d8 Mon Sep 17 00:00:00 2001 From: 11rohans <11rohans@gmail.com> Date: Wed, 2 May 2018 23:55:04 -0700 Subject: [PATCH 1/8] added diff method --- python/ray/dataframe/dataframe.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index 98305f42a1b7..184056a8d676 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -1356,9 +1356,30 @@ def describe_helper(df): return result def diff(self, periods=1, axis=0): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Finds the difference between elements on the axis requested + + Args: + periods: Periods to shift for forming difference + axis: Take difference over rows or columns + + Returns: + DataFrame with the diff applied + """ + + if (axis == 1 or axis == 'columns'): + result = _map_partitions(lambda df: + df.diff(axis=axis, periods=periods), + self._row_partitions) + return DataFrame(row_partitions=result, + columns=self.columns, + index=self.index) + if (axis == 0 or axis == 'index'): + result = _map_partitions(lambda df: + df.diff(axis=axis, periods=periods), + self._col_partitions) + return DataFrame(col_partitions=result, + columns=self.columns, + index=self.index) def div(self, other, axis='columns', level=None, fill_value=None): """Divides this DataFrame against another DataFrame/Series/scalar. @@ -1573,7 +1594,7 @@ def helper(df, index, other_series): # TODO: group series here into full df partitions to reduce # the number of remote calls to helper other_series = other_df.iloc[idx['index_within_partition']] - curr_index = self._row_metadata._coord_df.iloc[i] + curr_index = self._row_metadata._coord_df.loc[i] curr_df = self._row_partitions[int(curr_index['partition'])] results.append(_deploy_func.remote(helper, curr_df, From 0655d7b9c5e8570ccec35bdb13aa28023f3469bf Mon Sep 17 00:00:00 2001 From: 11rohans <11rohans@gmail.com> Date: Wed, 2 May 2018 23:58:54 -0700 Subject: [PATCH 2/8] sanity checks --- python/ray/dataframe/test/test_dataframe.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index 42f267db3865..c863a97e7d6f 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -221,6 +221,7 @@ def test_int_dataframe(): test_quantile(ray_df, pandas_df, .5) test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) + test_diff(ray_df, pandas_df) test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) @@ -380,6 +381,7 @@ def test_float_dataframe(): test_quantile(ray_df, pandas_df, .5) test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) + test_diff(ray_df, pandas_df) test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) @@ -542,6 +544,7 @@ def test_mixed_dtype_dataframe(): test_quantile(ray_df, pandas_df, .5) test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) + test_diff(ray_df, pandas_df) test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) @@ -695,6 +698,7 @@ def test_nan_dataframe(): test_quantile(ray_df, pandas_df, .5) test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) + test_diff(ray_df, pandas_df) test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) @@ -1133,11 +1137,8 @@ def test_describe(ray_df, pandas_df): assert(ray_df.describe().equals(pandas_df.describe())) -def test_diff(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.diff() +def test_diff(ray_df, pandas_df): + assert(ray_df.diff().equals(pandas_df.diff())) def test_div(): From e727aaf1d0567e28eeb0864ba5084fe99e1b45c1 Mon Sep 17 00:00:00 2001 From: 11rohans <11rohans@gmail.com> Date: Wed, 2 May 2018 23:59:52 -0700 Subject: [PATCH 3/8] flake8 --- python/ray/dataframe/dataframe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index 184056a8d676..bca1679cd87c 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -1357,15 +1357,15 @@ def describe_helper(df): def diff(self, periods=1, axis=0): """Finds the difference between elements on the axis requested - + Args: periods: Periods to shift for forming difference axis: Take difference over rows or columns - + Returns: - DataFrame with the diff applied + DataFrame with the diff applied """ - + if (axis == 1 or axis == 'columns'): result = _map_partitions(lambda df: df.diff(axis=axis, periods=periods), From ff49d39e96db7493a37704f0463b4962d46fd07d Mon Sep 17 00:00:00 2001 From: 11rohans <11rohans@gmail.com> Date: Thu, 3 May 2018 13:34:40 -0700 Subject: [PATCH 4/8] updated sanity checks' --- python/ray/dataframe/test/test_dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index c863a97e7d6f..cd78971580be 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -544,7 +544,6 @@ def test_mixed_dtype_dataframe(): test_quantile(ray_df, pandas_df, .5) test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) - test_diff(ray_df, pandas_df) test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) @@ -1137,8 +1136,9 @@ def test_describe(ray_df, pandas_df): assert(ray_df.describe().equals(pandas_df.describe())) +@pytest.fixture def test_diff(ray_df, pandas_df): - assert(ray_df.diff().equals(pandas_df.diff())) + assert(ray_df_equals_pandas(ray_df.diff(), pandas_df.diff())) def test_div(): From 2bbbcac1ee7d029e9b0fab769c72869288e54234 Mon Sep 17 00:00:00 2001 From: 11rohans <11rohans@gmail.com> Date: Thu, 3 May 2018 21:25:57 -0700 Subject: [PATCH 5/8] rebase and style updates --- python/ray/dataframe/dataframe.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index bca1679cd87c..cf3051fa9530 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -1365,15 +1365,17 @@ def diff(self, periods=1, axis=0): Returns: DataFrame with the diff applied """ + axis = pd.DataFrame()._get_axis_number(axis) if axis is not None \ + else 0 - if (axis == 1 or axis == 'columns'): + if (axis == 1): result = _map_partitions(lambda df: df.diff(axis=axis, periods=periods), self._row_partitions) return DataFrame(row_partitions=result, columns=self.columns, index=self.index) - if (axis == 0 or axis == 'index'): + if (axis == 0): result = _map_partitions(lambda df: df.diff(axis=axis, periods=periods), self._col_partitions) From e94db65d8e007303bd56f05eafb4631096464096 Mon Sep 17 00:00:00 2001 From: 11rohans <11rohans@gmail.com> Date: Fri, 4 May 2018 12:09:37 -0700 Subject: [PATCH 6/8] updated diff tests and cleaned up code --- python/ray/dataframe/dataframe.py | 15 +++++++-------- python/ray/dataframe/test/test_dataframe.py | 1 + 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index cf3051fa9530..aabdc08d07e3 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -1365,20 +1365,19 @@ def diff(self, periods=1, axis=0): Returns: DataFrame with the diff applied """ - axis = pd.DataFrame()._get_axis_number(axis) if axis is not None \ - else 0 + axis = pd.DataFrame()._get_axis_number(axis) + partitions = (self._col_partitions if + axis == 0 else self._row_partitions) + + result = _map_partitions(lambda df: + df.diff(axis=axis, periods=periods), + partitions) if (axis == 1): - result = _map_partitions(lambda df: - df.diff(axis=axis, periods=periods), - self._row_partitions) return DataFrame(row_partitions=result, columns=self.columns, index=self.index) if (axis == 0): - result = _map_partitions(lambda df: - df.diff(axis=axis, periods=periods), - self._col_partitions) return DataFrame(col_partitions=result, columns=self.columns, index=self.index) diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index cd78971580be..18a26a008ee1 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -1139,6 +1139,7 @@ def test_describe(ray_df, pandas_df): @pytest.fixture def test_diff(ray_df, pandas_df): assert(ray_df_equals_pandas(ray_df.diff(), pandas_df.diff())) + assert(ray_df_equals_pandas(ray_df.diff(axis=1), pandas_df.diff(axis=1))) def test_div(): From b9fa2cc77e294cf877169277d665ea671861bea8 Mon Sep 17 00:00:00 2001 From: 11rohans <11rohans@gmail.com> Date: Fri, 4 May 2018 12:15:04 -0700 Subject: [PATCH 7/8] updated tests for periods --- python/ray/dataframe/test/test_dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index 18a26a008ee1..d1d98b78a439 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -1140,6 +1140,7 @@ def test_describe(ray_df, pandas_df): def test_diff(ray_df, pandas_df): assert(ray_df_equals_pandas(ray_df.diff(), pandas_df.diff())) assert(ray_df_equals_pandas(ray_df.diff(axis=1), pandas_df.diff(axis=1))) + assert(ray_df_equals_pandas(ray_df.diff(periods=1), pandas_df.diff(periods=1))) def test_div(): From f2fd7093ef107cf5926f81daff3892c94e33e477 Mon Sep 17 00:00:00 2001 From: 11rohans <11rohans@gmail.com> Date: Fri, 4 May 2018 14:37:56 -0700 Subject: [PATCH 8/8] flake8 --- python/ray/dataframe/test/test_dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index d1d98b78a439..f0350e5af794 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -1140,7 +1140,8 @@ def test_describe(ray_df, pandas_df): def test_diff(ray_df, pandas_df): assert(ray_df_equals_pandas(ray_df.diff(), pandas_df.diff())) assert(ray_df_equals_pandas(ray_df.diff(axis=1), pandas_df.diff(axis=1))) - assert(ray_df_equals_pandas(ray_df.diff(periods=1), pandas_df.diff(periods=1))) + assert(ray_df_equals_pandas(ray_df.diff(periods=1), + pandas_df.diff(periods=1))) def test_div():