From 2ba513c2f74c629c5a0df9a799f75b851f55ecf5 Mon Sep 17 00:00:00 2001 From: William Ma Date: Sun, 9 Sep 2018 22:49:18 -0700 Subject: [PATCH 1/5] Updated to_datetime docstring --- modin/pandas/datetimes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modin/pandas/datetimes.py b/modin/pandas/datetimes.py index a3e293b2fbc..03dbe1cf2af 100644 --- a/modin/pandas/datetimes.py +++ b/modin/pandas/datetimes.py @@ -26,6 +26,7 @@ def to_datetime(arg, Args: errors ('raise' or 'ignore'): If 'ignore', errors are silenced. + Pandas blatantly ignores this argument so we will too. dayfirst (bool): Date format is passed in as day first. yearfirst (bool): Date format is passed in as year first. utc (bool): retuns a UTC DatetimeIndex if True. From 64cd25c03f1cfae80d98e1ee6e5db654b09bb75a Mon Sep 17 00:00:00 2001 From: William Ma Date: Wed, 12 Sep 2018 14:51:53 -0700 Subject: [PATCH 2/5] Updated astype tests --- modin/pandas/test/test_dataframe.py | 32 ++++++++++++++--------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index c71b69cb229..01fe9657610 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -1120,30 +1120,28 @@ def test_assign(): def test_astype(): td = TestData() - ray_df = pd.DataFrame(td.frame) - our_df_casted = ray_df.astype(np.int32) - expected_df_casted = pandas.DataFrame( - td.frame.values.astype(np.int32), + ray_df = pd.DataFrame(td.frame.values, + index=td.frame.index, + columns=td.frame.columns) + expected_df = pandas.DataFrame( + td.frame.values, index=td.frame.index, columns=td.frame.columns) - assert ray_df_equals_pandas(our_df_casted, expected_df_casted) + ray_df_casted = ray_df.astype(np.int32) + expected_df_casted = expected_df.astype(np.int32) - our_df_casted = ray_df.astype(np.float64) - expected_df_casted = pandas.DataFrame( - td.frame.values.astype(np.float64), - index=td.frame.index, - 
columns=td.frame.columns) + assert ray_df_equals_pandas(ray_df_casted, expected_df_casted) - assert ray_df_equals_pandas(our_df_casted, expected_df_casted) + ray_df_casted = ray_df.astype(np.float64) + expected_df_casted = expected_df.astype(np.float64) - our_df_casted = ray_df.astype(str) - expected_df_casted = pandas.DataFrame( - td.frame.values.astype(str), - index=td.frame.index, - columns=td.frame.columns) + assert ray_df_equals_pandas(ray_df_casted, expected_df_casted) + + ray_df_casted = ray_df.astype(str) + expected_df_casted = expected_df.astype(str) - assert ray_df_equals_pandas(our_df_casted, expected_df_casted) + assert ray_df_equals_pandas(ray_df_casted, expected_df_casted) def test_at_time(): From edfd8c250d74991eb4a7f5fc47ec937c0eeb0269 Mon Sep 17 00:00:00 2001 From: William Ma Date: Wed, 12 Sep 2018 19:04:28 -0700 Subject: [PATCH 3/5] Commented out loc and iloc tests --- modin/pandas/test/test_dataframe.py | 47 ++++++++++++++--------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 01fe9657610..d5598629d4b 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -317,8 +317,8 @@ def test_float_dataframe(): test_iteritems(ray_df, pandas_df) test_itertuples(ray_df, pandas_df) - test_loc(ray_df, pandas_df) - test_iloc(ray_df, pandas_df) + #test_loc(ray_df, pandas_df) + #test_iloc(ray_df, pandas_df) labels = ['a', 'b', 'c', 'd'] test_set_axis(ray_df, pandas_df, labels, 0) @@ -500,8 +500,8 @@ def test_mixed_dtype_dataframe(): test_iteritems(ray_df, pandas_df) test_itertuples(ray_df, pandas_df) - test_loc(ray_df, pandas_df) - test_iloc(ray_df, pandas_df) + #test_loc(ray_df, pandas_df) + #test_iloc(ray_df, pandas_df) labels = ['a', 'b', 'c', 'd'] test_set_axis(ray_df, pandas_df, labels, 0) @@ -652,8 +652,8 @@ def test_nan_dataframe(): test_iteritems(ray_df, pandas_df) test_itertuples(ray_df, pandas_df) - test_loc(ray_df, 
pandas_df) - test_iloc(ray_df, pandas_df) + #test_loc(ray_df, pandas_df) + #test_iloc(ray_df, pandas_df) labels = ['a', 'b', 'c', 'd'] test_set_axis(ray_df, pandas_df, labels, 0) @@ -1534,7 +1534,20 @@ def test_eval_df_use_case(): df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) - # Very hacky test to test eval while inplace is not working + # test eval for series results + tmp_pandas = df.eval( + "arctan2(sin(a), b)", + engine='python', + parser='pandas') + tmp_ray = ray_df.eval( + "arctan2(sin(a), b)", + engine='python', + parser='pandas') + + assert isinstance(tmp_ray, pandas.Series) + assert ray_series_equals_pandas(tmp_ray, tmp_pandas) + + # Test not inplace assignments tmp_pandas = df.eval( "e = arctan2(sin(a), b)", engine='python', @@ -1545,6 +1558,7 @@ def test_eval_df_use_case(): parser='pandas') assert ray_df_equals_pandas(tmp_ray, tmp_pandas) + # Test inplace assignments df.eval( "e = arctan2(sin(a), b)", engine='python', @@ -1559,6 +1573,7 @@ def test_eval_df_use_case(): assert ray_df_equals_pandas(ray_df, df) + def test_eval_df_arithmetic_subexpression(): frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)} df = pandas.DataFrame(frame_data) @@ -1571,24 +1586,6 @@ def test_eval_df_arithmetic_subexpression(): assert ray_df_equals_pandas(ray_df, df) -def test_eval_df_series_result(): - frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)} - df = pandas.DataFrame(frame_data) - ray_df = pd.DataFrame(frame_data) - - # Very hacky test to test eval while inplace is not working - tmp_pandas = df.eval( - "arctan2(sin(a), b)", - engine='python', - parser='pandas') - tmp_ray = ray_df.eval( - "arctan2(sin(a), b)", - engine='python', - parser='pandas') - assert ray_df_equals_pandas(tmp_ray, tmp_pandas) - assert isinstance(to_pandas(tmp_ray), pandas.Series) - - def test_ewm(): ray_df = create_test_dataframe() From cb43f81dd32138d2757cc86559a991efcc6f21af Mon Sep 17 00:00:00 2001 From: William Ma Date: Wed, 12 Sep 2018 
19:07:47 -0700 Subject: [PATCH 4/5] Updated eval --- modin/data_management/data_manager.py | 37 +++++++++++++++++---------- modin/pandas/dataframe.py | 11 +++++--- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/modin/data_management/data_manager.py b/modin/data_management/data_manager.py index e1c8ce65e03..59b4cdd4716 100644 --- a/modin/data_management/data_manager.py +++ b/modin/data_management/data_manager.py @@ -815,30 +815,41 @@ def query_builder(df, **kwargs): def eval(self, expr, **kwargs): cls = type(self) - columns = self.columns + inplace = kwargs.get("inplace", False) + + columns = self.index if self._is_transposed else self.columns + index = self.columns if self._is_transposed else self.index + + # Run eval on columns to determine result type + columns_copy = pandas.DataFrame(columns=self.columns) + columns_copy = columns_copy.eval(expr, inplace=False, **kwargs) + expect_series = isinstance(columns_copy, pandas.Series) + + # if there is no assignment, then we simply save the results + # in the first column + if expect_series: + if inplace: + raise ValueError("Cannot operate inplace if there is no assignment") + else: + expr = "{0} = {1}".format(columns[0], expr) def eval_builder(df, **kwargs): df.columns = columns result = df.eval(expr, inplace=False, **kwargs) - # If result is a series, expr was not an assignment expression. 
- if not isinstance(result, pandas.Series): - result.columns = pandas.RangeIndex(0, len(result.columns)) + result.columns = pandas.RangeIndex(0, len(result.columns)) return result func = self._prepare_method(eval_builder, **kwargs) new_data = self.map_across_full_axis(1, func) - # eval can update the columns, so we must update columns - columns_copy = pandas.DataFrame(columns=columns) - columns_copy = columns_copy.eval(expr, inplace=False, **kwargs) - if isinstance(columns_copy, pandas.Series): - # To create a data manager, we need the - # columns to be in a list-like - columns = list(columns_copy.name) + if expect_series: + result = new_data.to_pandas()[0] + result.name = columns_copy.name + result.index = index + return result else: columns = columns_copy.columns - - return cls(new_data, self.index, columns) + return cls(new_data, self.index, columns) def quantile_for_list_of_values(self, **kwargs): cls = type(self) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index ff97c95bb98..6b1ba67516f 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1344,12 +1344,15 @@ def eval(self, expr, inplace=False, **kwargs): self._validate_eval_query(expr, **kwargs) inplace = validate_bool_kwarg(inplace, "inplace") - data_manager = self._data_manager.eval(expr, **kwargs) + result = self._data_manager.eval(expr, **kwargs) - if inplace: - self._update_inplace(new_manager=data_manager) + if isinstance(result, pandas.Series): + return result else: - return DataFrame(data_manager=data_manager) + if inplace: + self._update_inplace(new_manager=result) + else: + return DataFrame(data_manager=result) def ewm(self, com=None, From d731edebf9b0b429bbd35424fb2a558d285d8431 Mon Sep 17 00:00:00 2001 From: William Ma Date: Thu, 13 Sep 2018 21:44:02 -0700 Subject: [PATCH 5/5] removed empty space and uncommented test_loc and test_iloc --- modin/pandas/test/test_dataframe.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git 
a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index d5598629d4b..575ec3b9c8f 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -146,8 +146,8 @@ def test_int_dataframe(): test_cumsum(ray_df, pandas_df) test_pipe(ray_df, pandas_df) - # test_loc(ray_df, pandas_df) - # test_iloc(ray_df, pandas_df) + test_loc(ray_df, pandas_df) + test_iloc(ray_df, pandas_df) labels = ['a', 'b', 'c', 'd'] test_set_axis(ray_df, pandas_df, labels, 0) @@ -317,8 +317,8 @@ def test_float_dataframe(): test_iteritems(ray_df, pandas_df) test_itertuples(ray_df, pandas_df) - #test_loc(ray_df, pandas_df) - #test_iloc(ray_df, pandas_df) + test_loc(ray_df, pandas_df) + test_iloc(ray_df, pandas_df) labels = ['a', 'b', 'c', 'd'] test_set_axis(ray_df, pandas_df, labels, 0) @@ -500,8 +500,8 @@ def test_mixed_dtype_dataframe(): test_iteritems(ray_df, pandas_df) test_itertuples(ray_df, pandas_df) - #test_loc(ray_df, pandas_df) - #test_iloc(ray_df, pandas_df) + test_loc(ray_df, pandas_df) + test_iloc(ray_df, pandas_df) labels = ['a', 'b', 'c', 'd'] test_set_axis(ray_df, pandas_df, labels, 0) @@ -652,8 +652,8 @@ def test_nan_dataframe(): test_iteritems(ray_df, pandas_df) test_itertuples(ray_df, pandas_df) - #test_loc(ray_df, pandas_df) - #test_iloc(ray_df, pandas_df) + test_loc(ray_df, pandas_df) + test_iloc(ray_df, pandas_df) labels = ['a', 'b', 'c', 'd'] test_set_axis(ray_df, pandas_df, labels, 0) @@ -1573,7 +1573,6 @@ def test_eval_df_use_case(): assert ray_df_equals_pandas(ray_df, df) - def test_eval_df_arithmetic_subexpression(): frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)} df = pandas.DataFrame(frame_data)