-
Notifications
You must be signed in to change notification settings - Fork 7.2k
[DataFrame] Implement Indexer getitem #1955
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,18 +35,21 @@ | |
| _blocks_to_col, | ||
| _blocks_to_row, | ||
| _create_block_partitions, | ||
| _mask_block_partitions, | ||
| _inherit_docstrings, | ||
| _reindex_helper) | ||
| from . import get_npartitions | ||
| from .index_metadata import _IndexMetadata | ||
| from .indexing import _Loc_Indexer, _iLoc_Indexer | ||
|
|
||
|
|
||
| @_inherit_docstrings(pd.DataFrame) | ||
| class DataFrame(object): | ||
|
|
||
| def __init__(self, data=None, index=None, columns=None, dtype=None, | ||
| copy=False, col_partitions=None, row_partitions=None, | ||
| block_partitions=None, row_metadata=None, col_metadata=None): | ||
| block_partitions=None, row_metadata=None, col_metadata=None, | ||
| partial=False): | ||
| """Distributed DataFrame object backed by Pandas dataframes. | ||
|
|
||
| Args: | ||
|
|
@@ -69,7 +72,11 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, | |
| Metadata for the new dataframe's rows | ||
| col_metadata (_IndexMetadata): | ||
| Metadata for the new dataframe's columns | ||
| partial (boolean): | ||
| Internal: row_metadata and col_metadata only covers part of the | ||
| block partitions. (Used in index 'vew' accessor) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| """ | ||
| self.partial = partial | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| self._row_metadata = self._col_metadata = None | ||
|
|
||
| # Check type of data and use appropriate constructor | ||
|
|
@@ -93,6 +100,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, | |
| axis = 0 | ||
| columns = pd_df.columns | ||
| index = pd_df.index | ||
| self._row_metadata = self._col_metadata = None | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Duplicated (Remove this in favor of Line 73) |
||
| else: | ||
| # created this invariant to make sure we never have to go into the | ||
| # partitions to get the columns | ||
|
|
@@ -103,25 +111,28 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, | |
| if block_partitions is not None: | ||
| # put in numpy array here to make accesses easier since it's 2D | ||
| self._block_partitions = np.array(block_partitions) | ||
| if row_metadata is not None: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need this in two places? |
||
| self._row_metadata = row_metadata.copy() | ||
| if col_metadata is not None: | ||
| self._col_metadata = col_metadata.copy() | ||
| assert self._block_partitions.ndim == 2, \ | ||
| "Block Partitions must be 2D." | ||
| else: | ||
| if row_partitions is not None: | ||
| axis = 0 | ||
| partitions = row_partitions | ||
| if row_metadata is not None: | ||
| self._row_metadata = row_metadata.copy() | ||
| elif col_partitions is not None: | ||
| axis = 1 | ||
| partitions = col_partitions | ||
| if col_metadata is not None: | ||
| self._col_metadata = col_metadata.copy() | ||
|
|
||
| self._block_partitions = \ | ||
| _create_block_partitions(partitions, axis=axis, | ||
| length=len(columns)) | ||
|
|
||
| if row_metadata is not None: | ||
| self._row_metadata = row_metadata.copy() | ||
| if col_metadata is not None: | ||
| self._col_metadata = col_metadata.copy() | ||
|
|
||
| # Sometimes we only get a single column or row, which is | ||
| # problematic for building blocks from the partitions, so we | ||
| # add whatever dimension we're missing from the input. | ||
|
|
@@ -138,6 +149,21 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, | |
| self._col_metadata = _IndexMetadata(self._block_partitions[0, :], | ||
| index=columns, axis=1) | ||
|
|
||
| def _get_block_partitions(self): | ||
| if (self._row_metadata is None or self._col_metadata is None)\ | ||
| or (not self.partial): | ||
| return self._block_partitions_data | ||
| else: # is partial, need mask | ||
| oid_arr = _mask_block_partitions(self._block_partitions_data, | ||
| self._row_metadata, | ||
| self._col_metadata) | ||
| return oid_arr | ||
|
|
||
| def _set_block_partitions(self, new_block_partitions): | ||
| self._block_partitions_data = new_block_partitions | ||
|
|
||
| _block_partitions = property(_get_block_partitions, _set_block_partitions) | ||
|
|
||
| def _get_row_partitions(self): | ||
| return [_blocks_to_row.remote(*part) | ||
| for part in self._block_partitions] | ||
|
|
@@ -408,6 +434,10 @@ def dtypes(self): | |
| Returns: | ||
| The dtypes for this DataFrame. | ||
| """ | ||
| # Deal with empty column case | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Prefer comment to say |
||
| if self.shape[1] == 0: | ||
| return pd.Series() | ||
|
|
||
| # The dtypes are common across all partitions. | ||
| # The first partition will be enough. | ||
| result = ray.get(_deploy_func.remote(lambda df: df.dtypes, | ||
|
|
@@ -3775,9 +3805,7 @@ def loc(self): | |
| We currently support: single label, list array, slice object | ||
| We do not support: boolean array, callable | ||
| """ | ||
| raise NotImplementedError( | ||
| "To contribute to Pandas on Ray, please visit " | ||
| "github.com/ray-project/ray.") | ||
| return _Loc_Indexer(self) | ||
|
|
||
| @property | ||
| def is_copy(self): | ||
|
|
@@ -3812,6 +3840,4 @@ def iloc(self): | |
| We currently support: single label, list array, slice object | ||
| We do not support: boolean array, callable | ||
| """ | ||
| raise NotImplementedError( | ||
| "To contribute to Pandas on Ray, please visit " | ||
| "github.com/ray-project/ray.") | ||
| return _iLoc_Indexer(self) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it make more sense to have a
DataFrameViewobject as a subclass of this one?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As I'm reading through this, I think it might make more sense to have it as a subclass. Changing the way that
_block_partitionsis handled in the case that it's a view makes complicated code more complicated.Alternatively, if we decide otherwise, I would still like to see more comments about the
_block_partitionschanges.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree we should have it as a subclass.
DataFrameViewobject make sense.I'm refactoring parts of my
indexing.pyto make enlargement inside the parent class of the_LocIndexerand_iLocIndexer. So that we have cleaner code inindexing.py