Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable DataFrame setting value as list of labels. (Resolves #894) #905

Merged
merged 8 commits into from
Oct 13, 2019
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 44 additions & 17 deletions databricks/koalas/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,12 +283,14 @@ class LocIndexer(object):

**Setting values**

Setting value for all items matching the list of labels is not allowed
Setting value for all items matching the list of labels.

>>> df.loc[['viper', 'sidewinder'], ['shield']] = 50
Traceback (most recent call last):
...
databricks.koalas.exceptions.SparkPandasNotImplementedError: ...
>>> df
max_speed shield
cobra 1 2
viper 4 50
sidewinder 7 50

Setting value for an entire row is not allowed

Expand All @@ -303,17 +305,26 @@ class LocIndexer(object):
>>> df
max_speed shield
cobra 30 2
viper 30 5
sidewinder 30 8
viper 30 50
sidewinder 30 50

Set value for an entire list of columns

>>> df.loc[:, ['max_speed', 'shield']] = 100
>>> df
max_speed shield
cobra 100 100
viper 100 100
sidewinder 100 100

Set value with Series

>>> df.loc[:, 'shield'] = df['shield'] * 2
>>> df
max_speed shield
cobra 30 4
viper 30 10
sidewinder 30 16
cobra 100 200
viper 100 200
sidewinder 100 200

**Getting values on a DataFrame with an index that has integer labels**

Expand Down Expand Up @@ -492,22 +503,38 @@ def __setitem__(self, key, value):
rows_sel, cols_sel = key

if (not isinstance(rows_sel, slice)) or (rows_sel != slice(None)):
raise SparkPandasNotImplementedError(
description="""Can only assign value to the whole dataframe, the row index
has to be `slice(None)` or `:`""",
pandas_function=".loc[..., ...] = ...",
spark_target_function="withColumn, select")
if isinstance(rows_sel, list):
if isinstance(cols_sel, str):
cols_sel = [cols_sel]
kdf = self._kdf
itholic marked this conversation as resolved.
Show resolved Hide resolved
sdf = kdf._sdf
for col_sel in cols_sel:
sdf = sdf.withColumn(
col_sel,
(F.when(F.col(kdf._internal.index_columns[0]).isin(rows_sel), value)
.otherwise(F.col(col_sel))))
itholic marked this conversation as resolved.
Show resolved Hide resolved
self._kdf._internal = self._kdf._internal.copy(sdf=sdf)
else:
raise SparkPandasNotImplementedError(
description="""Can only assign value to the whole dataframe, the row index
has to be `slice(None)` or `:`""",
pandas_function=".loc[..., ...] = ...",
spark_target_function="withColumn, select")

if not isinstance(cols_sel, str):
itholic marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError("""only column names can be assigned""")
if not isinstance(cols_sel, (str, list)):
raise ValueError("""only column names or list of column names can be assigned""")

if isinstance(value, DataFrame):
if len(value.columns) == 1:
self._kdf[cols_sel] = _col(value)
else:
raise ValueError("Only a dataframe with one column can be assigned")
else:
self._kdf[cols_sel] = value
if isinstance(cols_sel, str):
cols_sel = [cols_sel]
if (not isinstance(rows_sel, list)) and (isinstance(cols_sel, list)):
for col_sel in cols_sel:
self._kdf[col_sel] = value


class ILocIndexer(object):
Expand Down
27 changes: 27 additions & 0 deletions databricks/koalas/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,33 @@ def test_iloc_series(self):
self.assert_eq(kseries.iloc[:1], pseries.iloc[:1])
self.assert_eq(kseries.iloc[:-1], pseries.iloc[:-1])

def test_setitem(self):
    """Verify that ``.loc`` assignment on a Koalas DataFrame mirrors pandas."""
    snakes = ['cobra', 'viper', 'sidewinder']
    selected = ['viper', 'sidewinder']

    pdf = pd.DataFrame([[1, 2], [4, 5], [7, 8]], index=snakes,
                       columns=['max_speed', 'shield'])
    kdf = ks.from_pandas(pdf)

    # A list of row labels combined with a list of column labels.
    for frame in (pdf, kdf):
        frame.loc[selected, ['shield', 'max_speed']] = 10
    self.assert_eq(kdf, pdf)

    # A list of row labels combined with a single column name.
    for frame in (pdf, kdf):
        frame.loc[selected, 'shield'] = 50
    self.assert_eq(kdf, pdf)

    # kdf still has two columns here, so using it as the value must be rejected.
    expected_error = 'Only a dataframe with one column can be assigned'
    with self.assertRaisesRegex(ValueError, expected_error):
        kdf.loc[:, 'max_speed'] = kdf

    # A single-column frame is an acceptable value for a whole column.
    pdf = pd.DataFrame([[1], [4], [7]], index=snakes, columns=['max_speed'])
    kdf = ks.from_pandas(pdf)

    for frame in (pdf, kdf):
        frame.loc[:, 'max_speed'] = frame
    self.assert_eq(kdf, pdf)

def test_iloc_raises(self):
pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
kdf = ks.from_pandas(pdf)
Expand Down