Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Series.update #923

Merged
merged 11 commits into from
Nov 7, 2019
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
Original file line number Diff line number Diff line change
@@ -111,7 +111,6 @@ class _MissingPandasLikeSeries(object):
tz_convert = unsupported_function('tz_convert')
tz_localize = unsupported_function('tz_localize')
unstack = unsupported_function('unstack')
update = unsupported_function('update')
view = unsupported_function('view')
where = unsupported_function('where')
xs = unsupported_function('xs')
91 changes: 90 additions & 1 deletion databricks/koalas/series.py
Original file line number Diff line number Diff line change
@@ -42,7 +42,7 @@
from databricks.koalas.internal import IndexMap, _InternalFrame, SPARK_INDEX_NAME_FORMAT
from databricks.koalas.missing.series import _MissingPandasLikeSeries
from databricks.koalas.plot import KoalasSeriesPlotMethods
from databricks.koalas.utils import validate_arguments_and_invoke_function, scol_for
from databricks.koalas.utils import validate_arguments_and_invoke_function, scol_for, combine_frames
from databricks.koalas.datetimes import DatetimeMethods
from databricks.koalas.strings import StringMethods

@@ -3405,6 +3405,95 @@ def replace(self, to_replace=None, value=None, regex=False) -> 'Series':

return self._with_new_scol(current)

def update(self, other):
"""
Modify Series in place using non-NA values from passed Series. Aligns on index.

Parameters
----------
other : Series

Examples
--------
>>> from databricks.koalas.config import set_option, reset_option
>>> set_option("compute.ops_on_diff_frames", True)
>>> s = ks.Series([1, 2, 3])
>>> s.update(ks.Series([4, 5, 6]))
>>> s
0 4
1 5
2 6
Name: 0, dtype: int64

>>> s = ks.Series(['a', 'b', 'c'])
>>> s.update(ks.Series(['d', 'e'], index=[0, 2]))
>>> s
0 d
1 b
2 e
Name: 0, dtype: object

>>> s = ks.Series([1, 2, 3])
>>> s.update(ks.Series([4, 5, 6, 7, 8]))
>>> s
0 4
1 5
2 6
Name: 0, dtype: int64

>>> s = ks.Series([1, 2, 3], index=[10, 11, 12])
>>> s
10 1
11 2
12 3
Name: 0, dtype: int64

>>> s.update(ks.Series([4, 5, 6]))
>>> s
10 1
12 3
11 2
Name: 0, dtype: int64

>>> s.update(ks.Series([4, 5, 6], index=[11, 12, 13]))
>>> s
10 1
12 5
11 4
Name: 0, dtype: int64

If ``other`` contains NaNs the corresponding values are not updated
in the original Series.

>>> s = ks.Series([1, 2, 3])
>>> s.update(ks.Series([4, np.nan, 6]))
>>> s
0 4.0
1 2.0
2 6.0
Name: 0, dtype: float64

>>> reset_option("compute.ops_on_diff_frames")
"""
# Reject anything that is not a Series up front; callers get a ValueError.
if not isinstance(other, Series):
raise ValueError("'other' must be a Series")

# Collect the first element of every index_map entry; these are later selected
# alongside the merged value column (presumably the Spark index column names).
index_scol_names = [index_map[0] for index_map in self._internal.index_map]
# 'leftouter' is requested so that every row of self survives the combine
# (assumes combine_frames joins the two frames on their index -- TODO confirm).
combined = combine_frames(self.to_frame(), other.to_frame(), how='leftouter')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can:

kdf = self.to_frame()
kdf[other_column_name] = other
that_column = scol_for(kdf, other.name)
this_column = scol_for(kdf, self.name)

cond = F.when(
    that_column.isNotNull(), that_column
).otherwise(this_column).alias(...)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@HyukjinKwon Thanks for reviewing! what if other.name and self.name are same?? i think maybe kdf[other_column_name] = other will overwrite existing column, wouldn't it??

combined_sdf = combined._sdf
# The combined frame is expected to expose self's column as "__this_<name>" and
# other's column as "__that_<name>" -- NOTE(review): this prefixing is done by
# combine_frames, which is not visible here; confirm against databricks.koalas.utils.
this_col = "__this_%s" % str(self.name)
that_col = "__that_%s" % str(other.name)
# Prefer other's value when it is non-null, otherwise keep self's value; the
# resulting column is aliased back to self's name.
cond = F.when(combined_sdf[that_col].isNotNull(), combined_sdf[that_col]) \
.otherwise(combined_sdf[this_col]) \
.alias(str(self.name))
# Build a new internal frame from the index columns plus the merged value column,
# reusing self's existing index_map and column_index.
internal = _InternalFrame(
sdf=combined_sdf.select(index_scol_names + [cond]),
index_map=self._internal.index_map,
column_index=self._internal.column_index)
self_updated = _col(ks.DataFrame(internal))
# Rebind this Series to the freshly built frame so the update is visible in place;
# nothing is returned.
self._internal = self_updated._internal
self._kdf = self_updated._kdf

def _cum(self, func, skipna, part_cols=()):
# This is used to cummin, cummax, cumsum, etc.
index_columns = self._internal.index_columns
8 changes: 8 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
@@ -728,3 +728,11 @@ def test_duplicates(self):

self.assert_eq(pser.drop_duplicates().sort_values(),
kser.drop_duplicates().sort_values())

def test_update(self):
# Checks argument validation of Series.update: a non-Series `other` must raise.
# NOTE(review): only the error path is asserted here; the value-merging behavior
# is exercised solely by the doctests in Series.update -- consider adding a
# comparison against pandas.
pser = pd.Series([10, 20, 15, 30, 45], name='x')
kser = ks.Series(pser)

# Passing a plain int must be rejected with the exact validation message.
msg = "'other' must be a Series"
with self.assertRaisesRegex(ValueError, msg):
kser.update(10)
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
Original file line number Diff line number Diff line change
@@ -176,6 +176,7 @@ Combining / joining / merging

Series.append
Series.replace
Series.update

Time series-related
-------------------