Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable DataFrame setting value as list of labels. (Resolves #894) #905

Merged
merged 8 commits into from
Oct 13, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 33 additions & 17 deletions databricks/koalas/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,12 +283,14 @@ class LocIndexer(object):

**Setting values**

Setting value for all items matching the list of labels is not allowed
Setting a value for all items matching the list of labels.

>>> df.loc[['viper', 'sidewinder'], ['shield']] = 50
Traceback (most recent call last):
...
databricks.koalas.exceptions.SparkPandasNotImplementedError: ...
>>> df
max_speed shield
cobra 1 2
viper 4 50
sidewinder 7 50

Setting a value for an entire row is not allowed.

Expand All @@ -303,17 +305,17 @@ class LocIndexer(object):
>>> df
max_speed shield
cobra 30 2
viper 30 5
sidewinder 30 8
viper 30 50
sidewinder 30 50

Set value with Series

>>> df.loc[:, 'shield'] = df['shield'] * 2
>>> df
max_speed shield
cobra 30 4
viper 30 10
sidewinder 30 16
viper 30 100
sidewinder 30 100

**Getting values on a DataFrame with an index that has integer labels**

Expand Down Expand Up @@ -492,21 +494,35 @@ def __setitem__(self, key, value):
rows_sel, cols_sel = key

if (not isinstance(rows_sel, slice)) or (rows_sel != slice(None)):
raise SparkPandasNotImplementedError(
description="""Can only assign value to the whole dataframe, the row index
has to be `slice(None)` or `:`""",
pandas_function=".loc[..., ...] = ...",
spark_target_function="withColumn, select")

if not isinstance(cols_sel, str):
itholic marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError("""only column names can be assigned""")
if isinstance(cols_sel, list):
kdf = self._kdf
itholic marked this conversation as resolved.
Show resolved Hide resolved
is_start = True
for col_sel in cols_sel:
if is_start:
sdf = kdf._sdf.withColumn(
col_sel,
(F.when(F.col(kdf._internal.index_columns[0]).isin(rows_sel), value)
.otherwise(F.col(col_sel))))
is_start = False
else:
sdf = sdf.withColumn(
col_sel,
(F.when(F.col(kdf._internal.index_columns[0]).isin(rows_sel), value)
.otherwise(F.col(col_sel))))
itholic marked this conversation as resolved.
Show resolved Hide resolved
self._kdf._internal = self._kdf._internal.copy(sdf=sdf)
else:
raise SparkPandasNotImplementedError(
description="""Can only assign value to the whole dataframe, the row index
has to be `slice(None)` or `:`""",
pandas_function=".loc[..., ...] = ...",
spark_target_function="withColumn, select")

if isinstance(value, DataFrame):
if len(value.columns) == 1:
self._kdf[cols_sel] = _col(value)
else:
raise ValueError("Only a dataframe with one column can be assigned")
else:
elif not isinstance(cols_sel, list):
itholic marked this conversation as resolved.
Show resolved Hide resolved
self._kdf[cols_sel] = value


Expand Down
27 changes: 27 additions & 0 deletions databricks/koalas/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,33 @@ def test_iloc_series(self):
self.assert_eq(kseries.iloc[:1], pseries.iloc[:1])
self.assert_eq(kseries.iloc[:-1], pseries.iloc[:-1])

def test_setitem(self):
    """Verify that ``.loc[...] = value`` on a Koalas DataFrame matches pandas.

    Covers three cases: assigning a scalar to a list of row labels and a
    list of column labels, the error raised when the column selector is a
    single label while the row selector is a list, and assigning a
    DataFrame (multi-column is rejected, single-column is accepted).
    """
    row_labels = ['viper', 'sidewinder']
    col_labels = ['shield', 'max_speed']

    expected = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
                            index=['cobra', 'viper', 'sidewinder'],
                            columns=['max_speed', 'shield'])
    actual = ks.from_pandas(expected)

    # Apply the same label-list assignment to pandas first, then Koalas.
    for frame in (expected, actual):
        frame.loc[row_labels, col_labels] = 10
    self.assert_eq(actual, expected)

    # A list of row labels combined with a single column label is unsupported.
    unsupported_msg = 'Can only assign value to the whole dataframe, the row index'
    with self.assertRaisesRegex(SparkPandasNotImplementedError, unsupported_msg):
        actual.loc[row_labels, 'shield'] = 10

    # Assigning a DataFrame with more than one column must be rejected.
    multi_column_msg = 'Only a dataframe with one column can be assigned'
    with self.assertRaisesRegex(ValueError, multi_column_msg):
        actual.loc[:, 'max_speed'] = actual

    # A single-column DataFrame can be assigned to a whole column.
    expected = pd.DataFrame([[1], [4], [7]],
                            index=['cobra', 'viper', 'sidewinder'],
                            columns=['max_speed'])
    actual = ks.from_pandas(expected)

    expected.loc[:, 'max_speed'] = expected
    actual.loc[:, 'max_speed'] = actual
    self.assert_eq(actual, expected)

def test_iloc_raises(self):
pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
kdf = ks.from_pandas(pdf)
Expand Down