From 12d030b2932c30e8c30f6732a3dfac05c6f85250 Mon Sep 17 00:00:00 2001 From: Peter Veerman Date: Thu, 3 May 2018 10:06:48 -0700 Subject: [PATCH 1/3] Implement df.as_matrix --- python/ray/dataframe/dataframe.py | 13 +++++++--- python/ray/dataframe/test/test_dataframe.py | 27 ++++++++++++++++++--- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index 5abd98f361bf..23cc43178e96 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -1189,9 +1189,16 @@ def as_blocks(self, copy=True): "github.com/ray-project/ray.") def as_matrix(self, columns=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Convert the frame to its Numpy-array representation. + + Args: + columns: If None, return all columns, otherwise, + returns specified columns. + + Returns: + values: ndarray + """ + return to_pandas(self).as_matrix(columns) def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index 7660b5366447..0e7f821453ba 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -994,10 +994,31 @@ def test_as_blocks(): def test_as_matrix(): - ray_df = create_test_dataframe() + test_data = TestData() + frame = rdf.DataFrame(test_data.frame) + mat = frame.as_matrix() + + frameCols = frame.columns + for i, row in enumerate(mat): + for j, value in enumerate(row): + col = frameCols[j] + if np.isnan(value): + assert np.isnan(frame[col][i]) + else: + assert value == frame[col][i] - with pytest.raises(NotImplementedError): - ray_df.as_matrix() + # mixed type + mat = rdf.DataFrame(test_data.mixed_frame).as_matrix(['foo', 'A']) + assert mat[0, 0] == 'bar' + + df = rdf.DataFrame({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]}) + mat = 
df.as_matrix() + assert mat[0, 0] == 1j + + # single block corner case + mat = rdf.DataFrame(test_data.frame).as_matrix(['A', 'B']) + expected = test_data.frame.reindex(columns=['A', 'B']).values + tm.assert_almost_equal(mat, expected) def test_asfreq(): From 490b889d01e00379202010e0299001fcb055b55e Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Sun, 6 May 2018 11:48:58 -0700 Subject: [PATCH 2/3] Addressing comments --- python/ray/dataframe/dataframe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index 23cc43178e96..0aff85c7e3cc 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -1198,6 +1198,7 @@ def as_matrix(self, columns=None): Returns: values: ndarray """ + # TODO this is very inefficient, also see __array__ return to_pandas(self).as_matrix(columns) def asfreq(self, freq, method=None, how=None, normalize=False, @@ -4595,8 +4596,8 @@ def __round__(self, decimals=0): "github.com/ray-project/ray.") def __array__(self, dtype=None): - # TODO: This is very inefficient and needs fix - return np.array(to_pandas(self)) + # TODO: This is very inefficient and needs fix, also see as_matrix + return to_pandas(self).__array__(dtype=dtype) def __array_wrap__(self, result, context=None): raise NotImplementedError( From 6ec0bbae757f95a076b122962b187a8e7b38048a Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Sun, 6 May 2018 18:53:15 -0700 Subject: [PATCH 3/3] Addressing comments --- python/ray/dataframe/test/test_dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index 0e7f821453ba..81cf5882cb08 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -998,10 +998,10 @@ def test_as_matrix(): frame = rdf.DataFrame(test_data.frame) mat = frame.as_matrix() - 
frameCols = frame.columns + frame_columns = frame.columns for i, row in enumerate(mat): for j, value in enumerate(row): - col = frameCols[j] + col = frame_columns[j] if np.isnan(value): assert np.isnan(frame[col][i]) else: