Skip to content

Commit 1a34aeb

Browse files
committed
Implement groupby.expanding.sum, min, max, mean in Series and DataFrame
1 parent 7a711d6 commit 1a34aeb

File tree

2 files changed

+267
-27
lines changed

2 files changed

+267
-27
lines changed

databricks/koalas/tests/test_expanding.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616
#
1717
import pandas as pd
1818

19-
import pandas as pd
20-
2119
import databricks.koalas as ks
2220
from databricks.koalas.testing.utils import ReusedSQLTestCase, TestUtils
2321
from databricks.koalas.window import Expanding
@@ -106,3 +104,15 @@ def _test_groupby_expanding_func(self, f):
106104

107105
def test_groupby_expanding_count(self):
108106
self._test_expanding_func("count")
107+
108+
def test_groupby_expanding_min(self):
109+
self._test_expanding_func("min")
110+
111+
def test_groupby_expanding_max(self):
112+
self._test_expanding_func("max")
113+
114+
def test_groupby_expanding_mean(self):
115+
self._test_expanding_func("mean")
116+
117+
def test_groupby_expanding_sum(self):
118+
self._test_expanding_func("sum")

databricks/koalas/window.py

+255-25
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
#
1616
from functools import partial
1717
from typing import Any
18-
from functools import reduce
1918

2019
from databricks.koalas.internal import _InternalFrame, SPARK_INDEX_NAME_FORMAT
2120
from databricks.koalas.utils import name_like_string
@@ -108,8 +107,9 @@ def count(self):
108107
109108
See Also
110109
--------
111-
Series.rolling : Calling object with Series data.
112-
DataFrame.rolling : Calling object with DataFrames.
110+
Series.expanding : Calling object with Series data.
111+
DataFrame.expanding : Calling object with DataFrames.
112+
Series.count : Count of the full Series.
113113
DataFrame.count : Count of the full DataFrame.
114114
115115
Examples
@@ -237,6 +237,7 @@ def count(self):
237237
--------
238238
Series.expanding : Calling object with Series data.
239239
DataFrame.expanding : Calling object with DataFrames.
240+
Series.count : Count of the full Series.
240241
DataFrame.count : Count of the full DataFrame.
241242
242243
Examples
@@ -283,6 +284,8 @@ def sum(self):
283284
284285
See Also
285286
--------
287+
Series.expanding : Calling object with Series data.
288+
DataFrame.expanding : Calling object with DataFrames.
286289
Series.sum : Reducing sum for Series.
287290
DataFrame.sum : Reducing sum for DataFrame.
288291
@@ -391,8 +394,10 @@ def max(self):
391394
392395
See Also
393396
--------
394-
Series.expanding : Series expanding.
395-
DataFrame.expanding : DataFrame expanding.
397+
Series.expanding : Calling object with Series data.
398+
DataFrame.expanding : Calling object with DataFrames.
399+
Series.max : Similar method for Series.
400+
DataFrame.max : Similar method for DataFrame.
396401
"""
397402
def max(scol):
398403
return F.when(
@@ -564,37 +569,262 @@ def count(self):
564569
--------
565570
Series.expanding : Calling object with Series data.
566571
DataFrame.expanding : Calling object with DataFrames.
572+
Series.count : Count of the full Series.
567573
DataFrame.count : Count of the full DataFrame.
568574
569575
Examples
570576
--------
571-
>>> s = ks.Series([2, 3, float("nan"), 10])
572-
>>> s.name = "col"
573-
>>> s.groupby(s).expanding().count().sort_index() # doctest: +NORMALIZE_WHITESPACE
574-
col
575-
2.0 0 1.0
576-
3.0 1 1.0
577-
10.0 3 1.0
578-
Name: col, dtype: float64
579-
580-
>>> df = s.to_frame()
581-
>>> df.groupby(df.col).expanding().count().sort_index() # doctest: +NORMALIZE_WHITESPACE
582-
col
583-
col
584-
2.0 0 1.0
585-
3.0 1 1.0
586-
10.0 3 1.0
577+
>>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
578+
>>> s.groupby(s).expanding(3).sum().sort_index() # doctest: +NORMALIZE_WHITESPACE
579+
0
580+
2 0 NaN
581+
1 NaN
582+
3 2 NaN
583+
3 NaN
584+
4 9.0
585+
4 5 NaN
586+
6 NaN
587+
7 12.0
588+
8 16.0
589+
5 9 NaN
590+
10 NaN
591+
Name: 0, dtype: float64
592+
593+
For DataFrame, each expanding sum is computed column-wise.
594+
595+
>>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
596+
>>> df.groupby(df.A).expanding(2).sum().sort_index() # doctest: +NORMALIZE_WHITESPACE
597+
A B
598+
A
599+
2 0 NaN NaN
600+
1 4.0 8.0
601+
3 2 NaN NaN
602+
3 6.0 18.0
603+
4 9.0 27.0
604+
4 5 NaN NaN
605+
6 8.0 32.0
606+
7 12.0 48.0
607+
8 16.0 64.0
608+
5 9 NaN NaN
609+
10 10.0 50.0
587610
"""
588611
return super(ExpandingGroupby, self).count()
589612

590613
def sum(self):
591-
raise NotImplementedError("groupby.expanding().sum() is currently not implemented yet.")
614+
"""
615+
Calculate expanding sum of given DataFrame or Series.
616+
617+
Returns
618+
-------
619+
Series or DataFrame
620+
Same type as the input, with the same index, containing the
621+
expanding sum.
622+
623+
See Also
624+
--------
625+
Series.expanding : Calling object with Series data.
626+
DataFrame.expanding : Calling object with DataFrames.
627+
Series.sum : Reducing sum for Series.
628+
DataFrame.sum : Reducing sum for DataFrame.
629+
630+
Examples
631+
--------
632+
>>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
633+
>>> s.groupby(s).expanding(3).sum().sort_index() # doctest: +NORMALIZE_WHITESPACE
634+
0
635+
2 0 NaN
636+
1 NaN
637+
3 2 NaN
638+
3 NaN
639+
4 9.0
640+
4 5 NaN
641+
6 NaN
642+
7 12.0
643+
8 16.0
644+
5 9 NaN
645+
10 NaN
646+
Name: 0, dtype: float64
647+
648+
For DataFrame, each expanding sum is computed column-wise.
649+
650+
>>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
651+
>>> df.groupby(df.A).expanding(2).sum().sort_index() # doctest: +NORMALIZE_WHITESPACE
652+
A B
653+
A
654+
2 0 NaN NaN
655+
1 4.0 8.0
656+
3 2 NaN NaN
657+
3 6.0 18.0
658+
4 9.0 27.0
659+
4 5 NaN NaN
660+
6 8.0 32.0
661+
7 12.0 48.0
662+
8 16.0 64.0
663+
5 9 NaN NaN
664+
10 10.0 50.0
665+
"""
666+
return super(ExpandingGroupby, self).sum()
592667

593668
def min(self):
594-
raise NotImplementedError("groupby.expanding().min() is currently not implemented yet.")
669+
"""
670+
Calculate the expanding minimum.
671+
672+
Returns
673+
-------
674+
Series or DataFrame
675+
Returned object type is determined by the caller of the expanding
676+
calculation.
677+
678+
See Also
679+
--------
680+
Series.expanding : Calling object with a Series.
681+
DataFrame.expanding : Calling object with a DataFrame.
682+
Series.min : Similar method for Series.
683+
DataFrame.min : Similar method for DataFrame.
684+
685+
Examples
686+
--------
687+
>>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
688+
>>> s.groupby(s).expanding(3).min().sort_index() # doctest: +NORMALIZE_WHITESPACE
689+
0
690+
2 0 NaN
691+
1 NaN
692+
3 2 NaN
693+
3 NaN
694+
4 3.0
695+
4 5 NaN
696+
6 NaN
697+
7 4.0
698+
8 4.0
699+
5 9 NaN
700+
10 NaN
701+
Name: 0, dtype: float64
702+
703+
For DataFrame, each expanding minimum is computed column-wise.
704+
705+
>>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
706+
>>> df.groupby(df.A).expanding(2).min().sort_index() # doctest: +NORMALIZE_WHITESPACE
707+
A B
708+
A
709+
2 0 NaN NaN
710+
1 2.0 4.0
711+
3 2 NaN NaN
712+
3 3.0 9.0
713+
4 3.0 9.0
714+
4 5 NaN NaN
715+
6 4.0 16.0
716+
7 4.0 16.0
717+
8 4.0 16.0
718+
5 9 NaN NaN
719+
10 5.0 25.0
720+
"""
721+
return super(ExpandingGroupby, self).min()
595722

596723
def max(self):
597-
raise NotImplementedError("groupby.expanding().max() is currently not implemented yet.")
724+
"""
725+
Calculate the expanding maximum.
726+
727+
Returns
728+
-------
729+
Series or DataFrame
730+
Return type is determined by the caller.
731+
732+
See Also
733+
--------
734+
Series.expanding : Calling object with Series data.
735+
DataFrame.expanding : Calling object with DataFrames.
736+
Series.max : Similar method for Series.
737+
DataFrame.max : Similar method for DataFrame.
738+
739+
Examples
740+
--------
741+
>>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
742+
>>> s.groupby(s).expanding(3).max().sort_index() # doctest: +NORMALIZE_WHITESPACE
743+
0
744+
2 0 NaN
745+
1 NaN
746+
3 2 NaN
747+
3 NaN
748+
4 3.0
749+
4 5 NaN
750+
6 NaN
751+
7 4.0
752+
8 4.0
753+
5 9 NaN
754+
10 NaN
755+
Name: 0, dtype: float64
756+
757+
For DataFrame, each expanding maximum is computed column-wise.
758+
759+
>>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
760+
>>> df.groupby(df.A).expanding(2).max().sort_index() # doctest: +NORMALIZE_WHITESPACE
761+
A B
762+
A
763+
2 0 NaN NaN
764+
1 2.0 4.0
765+
3 2 NaN NaN
766+
3 3.0 9.0
767+
4 3.0 9.0
768+
4 5 NaN NaN
769+
6 4.0 16.0
770+
7 4.0 16.0
771+
8 4.0 16.0
772+
5 9 NaN NaN
773+
10 5.0 25.0
774+
"""
775+
return super(ExpandingGroupby, self).max()
598776

599777
def mean(self):
600-
raise NotImplementedError("groupby.expanding().mean() is currently not implemented yet.")
778+
"""
779+
Calculate the expanding mean of the values.
780+
781+
Returns
782+
-------
783+
Series or DataFrame
784+
Returned object type is determined by the caller of the expanding
785+
calculation.
786+
787+
See Also
788+
--------
789+
Series.expanding : Calling object with Series data.
790+
DataFrame.expanding : Calling object with DataFrames.
791+
Series.mean : Equivalent method for Series.
792+
DataFrame.mean : Equivalent method for DataFrame.
793+
794+
Examples
795+
--------
796+
>>> s = ks.Series([2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5])
797+
>>> s.groupby(s).expanding(3).mean().sort_index() # doctest: +NORMALIZE_WHITESPACE
798+
0
799+
2 0 NaN
800+
1 NaN
801+
3 2 NaN
802+
3 NaN
803+
4 3.0
804+
4 5 NaN
805+
6 NaN
806+
7 4.0
807+
8 4.0
808+
5 9 NaN
809+
10 NaN
810+
Name: 0, dtype: float64
811+
812+
For DataFrame, each expanding mean is computed column-wise.
813+
814+
>>> df = ks.DataFrame({"A": s.to_numpy(), "B": s.to_numpy() ** 2})
815+
>>> df.groupby(df.A).expanding(2).mean().sort_index() # doctest: +NORMALIZE_WHITESPACE
816+
A B
817+
A
818+
2 0 NaN NaN
819+
1 2.0 4.0
820+
3 2 NaN NaN
821+
3 3.0 9.0
822+
4 3.0 9.0
823+
4 5 NaN NaN
824+
6 4.0 16.0
825+
7 4.0 16.0
826+
8 4.0 16.0
827+
5 9 NaN NaN
828+
10 5.0 25.0
829+
"""
830+
return super(ExpandingGroupby, self).mean()

0 commit comments

Comments
 (0)