Skip to content

Commit 4e7e63d

Browse files
authored
fixing __delitem__ and initial pass at drop WIP (#3)
1 parent 26165b1 commit 4e7e63d

File tree

1 file changed

+21
-9
lines changed

1 file changed

+21
-9
lines changed

python/ray/dataframe/dataframe.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
8080
index,
8181
columns)
8282

83-
8483
# this _index object is a pd.DataFrame
8584
# and we use that DataFrame's Index to index the rows.
8685
self._row_lengths, self._row_index = \
@@ -305,7 +304,7 @@ def _set__col_index(self, new__index):
305304
def _compute_row_lengths(self):
306305
"""Updates the stored lengths of DataFrame partitions
307306
"""
308-
self._row_lengths = [_deploy_func.remote(_get_row_lengths, d)
307+
self._row_lengths = [_deploy_func.remote(lambda df: len(df), d)
309308
for d in self._row_partitions]
310309

311310
def _get_row_lengths(self):
@@ -338,7 +337,7 @@ def _set_row_lengths(self, lengths):
338337
def _compute_col_lengths(self):
339338
"""Updates the stored lengths of DataFrame partitions
340339
"""
341-
self._col_lengths = [_deploy_func.remote(_get_col_lengths, d)
340+
self._col_lengths = [_deploy_func.remote(lambda df: df.shape[1], d)
342341
for d in self._col_partitions]
343342

344343
def _get_col_lengths(self):
@@ -1171,16 +1170,25 @@ def drop(self, labels=None, axis=0, index=None, columns=None, level=None,
11711170
try:
11721171
if not is_axis_zero or columns is not None:
11731172
values = labels if labels else columns
1173+
new_values = [self.columns.get_loc(i) for i in values]
11741174
new_df_rows = _map_partitions(
11751175
lambda df: df.drop(
1176-
values, axis=1, level=level, errors='ignore'),
1176+
new_values, axis=1, level=level, errors='ignore'),
11771177
self._row_partitions
11781178
)
1179-
new_columns = self.columns.to_series().drop(values,
1180-
errors=errors)
1181-
new_columns = pd.Index(new_columns)
1179+
new_columns = self._col_index.drop(values)
1180+
1181+
new_df_cols = self._col_partitions.copy()
1182+
col_parts_to_del = pd.Series(self._col_index.loc[values, 'partition']).unique()
1183+
for i in col_parts_to_del:
1184+
to_del = [self._col_index.loc[x, 'index_within_partition']
1185+
for x in values if self._col_index.loc[x, 'partition'] == i]
1186+
new_df_cols[i] = _deploy_func.remote(lambda df: df.drop(to_del), self._col_partitions[i])
1187+
1188+
11821189
new_df = DataFrame(columns=new_columns,
1183-
row_partitions=new_df_rows)
1190+
row_partitions=new_df_rows,
1191+
col_partitions=new_df_cols)
11841192
except (ValueError, KeyError):
11851193
if errors == 'raise':
11861194
raise
@@ -2986,9 +2994,13 @@ def __delitem__(self, key):
29862994
Args:
29872995
key: key to delete
29882996
"""
2997+
to_delete = self.columns.get_loc(key)
2998+
29892999
def del_helper(df):
2990-
df.__delitem__(self.columns.index(key))
3000+
df.__delitem__(to_delete)
3001+
df.reset_index(drop=True, inplace=True)
29913002
return df
3003+
29923004
self._row_partitions = _map_partitions(del_helper, self._row_partitions)
29933005

29943006
# TODO: See if this is faster than just:

0 commit comments

Comments
 (0)