From cdd47a596f9da057b191bef19d60621b8adaa88a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 9 Oct 2019 22:29:57 +0200 Subject: [PATCH 01/11] Add index unique --- databricks/koalas/indexes.py | 40 +++++++++++++++++++++++++ databricks/koalas/tests/test_indexes.py | 13 ++++++++ 2 files changed, 53 insertions(+) diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py index 352e6b4d0e..24c7c62a03 100644 --- a/databricks/koalas/indexes.py +++ b/databricks/koalas/indexes.py @@ -316,6 +316,46 @@ def is_object(self): """ return is_object_dtype(self.dtype) + def unique(self, level=None): + """ + Return unique values in the index. + + :param level: int or str, optional, default is None + :return: Index without deuplicates + + Examples + -------- + >>> ks.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 1, 3]).index.unique() + Int64Index([1, 3], dtype='int64') + """ + if level is not None: + self._validate_index_level(level) + sdf = self._kdf._sdf.select(self._scol).distinct() + return Index(DataFrame(self._kdf._internal.copy(sdf=sdf)), scol=self._scol) + + def _validate_index_level(self, level): + """ + Validate index level. + For single-level Index getting level number is a no-op, but some + verification must be done like in MultiIndex. + """ + if isinstance(level, int): + if level < 0 and level != -1: + raise IndexError( + "Too many levels: Index has only 1 level," + " %d is not a valid level number" % (level,) + ) + elif level > 0: + raise IndexError( + "Too many levels:" " Index has only 1 level, not %d" % (level + 1) + ) + elif level != self.name: + raise KeyError( + "Requested level ({}) does not match index name ({})".format( + level, self.name + ) + ) + def __getattr__(self, item: str) -> Any: if hasattr(_MissingPandasLikeIndex, item): property_or_func = getattr(_MissingPandasLikeIndex, item) diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py index fdfd23162b..8e05acb749 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -140,6 +140,19 @@ def test_multi_index_names(self): with self.assertRaises(PandasNotImplementedError): kidx.name = 'renamed' + def test_index_unique(self): + pidx = self.pdf.index + kidx = self.kdf.index + + self.assert_eq(pidx.unique(), kidx.unique()) + self.assert_eq(pidx.unique(level=0), kidx.unique(level=0)) + + with self.assertRaisesRegexp(IndexError, "*Too many levels*"): + kidx.unique(level=1) + + with self.assertRaisesRegexp(KeyError, "Requested level (hi)*"): + kidx.unique(level='hi') + def test_missing(self): kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) From e280ae361f93c0f744e7c61df857db8ebc699418 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 9 Oct 2019 22:32:58 +0200 Subject: [PATCH 02/11] fix typo --- databricks/koalas/tests/test_indexes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py index 8e05acb749..89348417ca 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -147,7 +147,7 @@ def test_index_unique(self): self.assert_eq(pidx.unique(), kidx.unique()) self.assert_eq(pidx.unique(level=0), kidx.unique(level=0)) - with self.assertRaisesRegexp(IndexError, "*Too many levels*"): + with self.assertRaisesRegexp(IndexError, "Too many levels*"): kidx.unique(level=1) with self.assertRaisesRegexp(KeyError, "Requested level (hi)*"): From 6eaffb806f119dcc08a6d20fbfd8afdffcbfed0b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 9 Oct 2019 22:40:47 +0200 Subject: [PATCH 03/11] add more test cases --- databricks/koalas/indexes.py | 3 +++ databricks/koalas/tests/test_indexes.py | 31 ++++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py index 24c7c62a03..21d1333391 100644 --- a/databricks/koalas/indexes.py +++ b/databricks/koalas/indexes.py @@ -327,6 +327,9 @@ def unique(self, level=None): -------- >>> ks.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 1, 3]).index.unique() Int64Index([1, 3], dtype='int64') + + >>> ks.DataFrame({'a': ['a', 'b', 'c']}, index=['d', 'e', 'e']).index.unique() + Index(['e', 'd'], dtype='object') """ if level is not None: self._validate_index_level(level) diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py index 89348417ca..ac295ce5f6 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -40,6 +40,18 @@ def pdf(self): def kdf(self): return ks.from_pandas(self.pdf) + @property + def pdf_idx_str(self): + # to have test case where index are object + return pd.DataFrame({ + 'a': [1, 2, 3, 4, 5], + 'b': [3, 4, 5, 6, 7] + }, index=['a', 'b', 'b', 'e', 'e']) + + @property + def kdf_idx_str(self): + return ks.from_pandas(self.pdf_idx_str) + def test_index(self): for pdf in [pd.DataFrame(np.random.randn(10, 5), index=list('abcdefghij')), pd.DataFrame(np.random.randn(10, 5), @@ -141,17 +153,20 @@ def test_multi_index_names(self): kidx.name = 'renamed' def test_index_unique(self): - pidx = self.pdf.index - kidx = self.kdf.index + # test on both index are int and object + for pdf, kdf in zip([self.pdf.index, self.pdf_idx_str.index], + [self.kdf.index, self.kdf_idx_str.index]): + pidx = pdf.index + kidx = kdf.index - self.assert_eq(pidx.unique(), kidx.unique()) - self.assert_eq(pidx.unique(level=0), kidx.unique(level=0)) + self.assert_eq(pidx.unique(), kidx.unique()) + self.assert_eq(pidx.unique(level=0), kidx.unique(level=0)) - with self.assertRaisesRegexp(IndexError, "Too many levels*"): - kidx.unique(level=1) + with self.assertRaisesRegexp(IndexError, "Too many levels*"): + kidx.unique(level=1) - with self.assertRaisesRegexp(KeyError, "Requested level (hi)*"): - kidx.unique(level='hi') + with self.assertRaisesRegexp(KeyError, "Requested level (hi)*"): + kidx.unique(level='hi') def test_missing(self): kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) From 72ca8e96b81dea8e33d5b56eb6d6f7d518bf5a86 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 9 Oct 2019 23:17:17 +0200 Subject: [PATCH 04/11] remove missing --- databricks/koalas/missing/indexes.py | 1 - databricks/koalas/tests/test_indexes.py | 32 +++++++------------------ 2 files changed, 9 insertions(+), 24 deletions(-) diff --git a/databricks/koalas/missing/indexes.py b/databricks/koalas/missing/indexes.py index 494aedd289..5fcf3e1abe 100644 --- a/databricks/koalas/missing/indexes.py +++ b/databricks/koalas/missing/indexes.py @@ -104,7 +104,6 @@ class _MissingPandasLikeIndex(object): to_numpy = unsupported_function('to_numpy') transpose = unsupported_function('transpose') union = unsupported_function('union') - unique = unsupported_function('unique') value_counts = unsupported_function('value_counts') view = unsupported_function('view') where = unsupported_function('where') diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py index ac295ce5f6..85e16b5360 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -40,18 +40,6 @@ def pdf(self): def kdf(self): return ks.from_pandas(self.pdf) - @property - def pdf_idx_str(self): - # to have test case where index are object - return pd.DataFrame({ - 'a': [1, 2, 3, 4, 5], - 'b': [3, 4, 5, 6, 7] - }, index=['a', 'b', 'b', 'e', 'e']) - - @property - def kdf_idx_str(self): - return ks.from_pandas(self.pdf_idx_str) - def test_index(self): for pdf in [pd.DataFrame(np.random.randn(10, 5), index=list('abcdefghij')), pd.DataFrame(np.random.randn(10, 5), @@ -153,20 +141,18 @@ def test_multi_index_names(self): kidx.name = 'renamed' def test_index_unique(self): - # test on both index are int and object - for pdf, kdf in zip([self.pdf.index, self.pdf_idx_str.index], - [self.kdf.index, self.kdf_idx_str.index]): - pidx = pdf.index - kidx = kdf.index - self.assert_eq(pidx.unique(), kidx.unique()) - self.assert_eq(pidx.unique(level=0), kidx.unique(level=0)) + pidx = self.pdf.index + kidx = self.kdf.index + + self.assert_eq(pidx.unique(), kidx.unique()) + self.assert_eq(pidx.unique(level=0), kidx.unique(level=0)) - with self.assertRaisesRegexp(IndexError, "Too many levels*"): - kidx.unique(level=1) + with self.assertRaisesRegexp(IndexError, "Too many levels*"): + kidx.unique(level=1) - with self.assertRaisesRegexp(KeyError, "Requested level (hi)*"): - kidx.unique(level='hi') + with self.assertRaisesRegexp(KeyError, "Requested level (hi)*"): + kidx.unique(level='hi') def test_missing(self): kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) From 2ffb74fdca2df4837a27b8b1b6b1c121eeb308fd Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 10 Oct 2019 09:02:51 +0200 Subject: [PATCH 05/11] fix test --- databricks/koalas/indexes.py | 1 + databricks/koalas/tests/test_indexes.py | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py index 21d1333391..73a368bad3 100644 --- a/databricks/koalas/indexes.py +++ b/databricks/koalas/indexes.py @@ -319,6 +319,7 @@ def is_object(self): def unique(self, level=None): """ Return unique values in the index. + Be aware the order of unique values might be different than pandas.Index.unique :param level: int or str, optional, default is None :return: Index without deuplicates diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py index 85e16b5360..c60562ad3e 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -141,12 +141,13 @@ def test_multi_index_names(self): kidx.name = 'renamed' def test_index_unique(self): - - pidx = self.pdf.index kidx = self.kdf.index - self.assert_eq(pidx.unique(), kidx.unique()) - self.assert_eq(pidx.unique(level=0), kidx.unique(level=0)) + # here the output is different than pandas in terms of order + expected = pd.Int64Index([0, 6, 9, 5, 1, 3, 8], dtype='int64') + + self.assert_eq(expected, kidx.unique()) + self.assert_eq(expected, kidx.unique(level=0)) with self.assertRaisesRegexp(IndexError, "Too many levels*"): kidx.unique(level=1) From 037379b2a126dc1ae84411e8b672c3929b16c621 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 10 Oct 2019 09:40:17 +0200 Subject: [PATCH 06/11] Add reference --- docs/source/reference/index.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 34a6f00313..cbc6d9f075 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -12,3 +12,9 @@ API Reference window groupby ml + +Computations / Descriptive Stats +-------------------------------- +.. autosummary:: + :toctree: api/ + Index.unique From be58f57d075ebbac581773eaeb462981fbe2c37a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 10 Oct 2019 19:43:06 +0200 Subject: [PATCH 07/11] remove reference --- docs/source/reference/index.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index cbc6d9f075..34a6f00313 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -12,9 +12,3 @@ API Reference window groupby ml - -Computations / Descriptive Stats --------------------------------- -.. autosummary:: - :toctree: api/ - Index.unique From bb9b7236538f410b1f6d15e3e316b13d8b002201 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 10 Oct 2019 20:25:34 +0200 Subject: [PATCH 08/11] remove missing --- databricks/koalas/missing/indexes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/databricks/koalas/missing/indexes.py b/databricks/koalas/missing/indexes.py index 5fcf3e1abe..91c0fa8b43 100644 --- a/databricks/koalas/missing/indexes.py +++ b/databricks/koalas/missing/indexes.py @@ -218,7 +218,6 @@ class _MissingPandasLikeMultiIndex(object): transpose = unsupported_function('transpose') truncate = unsupported_function('truncate') union = unsupported_function('union') - unique = unsupported_function('unique') value_counts = unsupported_function('value_counts') view = unsupported_function('view') where = unsupported_function('where') From 3047c9a53dcb0bde53b22884567b4dd2b27edbfa Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 11 Oct 2019 18:55:10 +0200 Subject: [PATCH 09/11] add not implementation error --- databricks/koalas/indexes.py | 3 +++ databricks/koalas/missing/indexes.py | 1 + 2 files changed, 4 insertions(+) diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py index 73a368bad3..dffc9e1692 100644 --- a/databricks/koalas/indexes.py +++ b/databricks/koalas/indexes.py @@ -466,6 +466,9 @@ def to_pandas(self) -> pd.MultiIndex: toPandas = to_pandas + def unique(self, level=None): + raise NotImplementedError("unique function for MutliIndex is not implemented yet.") + def __getattr__(self, item: str) -> Any: if hasattr(_MissingPandasLikeMultiIndex, item): property_or_func = getattr(_MissingPandasLikeMultiIndex, item) diff --git a/databricks/koalas/missing/indexes.py b/databricks/koalas/missing/indexes.py index 91c0fa8b43..5fcf3e1abe 100644 --- a/databricks/koalas/missing/indexes.py +++ b/databricks/koalas/missing/indexes.py @@ -218,6 +218,7 @@ class _MissingPandasLikeMultiIndex(object): transpose = unsupported_function('transpose') truncate = unsupported_function('truncate') union = unsupported_function('union') + unique = unsupported_function('unique') value_counts = unsupported_function('value_counts') view = unsupported_function('view') where = unsupported_function('where') From 049748e3a5e8b52af9bef9c10710b34724ecee66 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 11 Oct 2019 23:17:49 +0200 Subject: [PATCH 10/11] change to pandas not imple --- databricks/koalas/indexes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py index dffc9e1692..a9646b42cd 100644 --- a/databricks/koalas/indexes.py +++ b/databricks/koalas/indexes.py @@ -467,7 +467,7 @@ def to_pandas(self) -> pd.MultiIndex: toPandas = to_pandas def unique(self, level=None): - raise NotImplementedError("unique function for MutliIndex is not implemented yet.") + raise PandasNotImplementedError(class_name='MultiIndex', method_name='unique') def __getattr__(self, item: str) -> Any: if hasattr(_MissingPandasLikeMultiIndex, item): From 51dfb7e7846efa05c884b576acb74dfa42fa7fcf Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 21 Oct 2019 10:30:26 +0200 Subject: [PATCH 11/11] remove from mulindex in missing --- databricks/koalas/missing/indexes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/databricks/koalas/missing/indexes.py b/databricks/koalas/missing/indexes.py index 5fcf3e1abe..91c0fa8b43 100644 --- a/databricks/koalas/missing/indexes.py +++ b/databricks/koalas/missing/indexes.py @@ -218,7 +218,6 @@ class _MissingPandasLikeMultiIndex(object): transpose = unsupported_function('transpose') truncate = unsupported_function('truncate') union = unsupported_function('union') - unique = unsupported_function('unique') value_counts = unsupported_function('value_counts') view = unsupported_function('view') where = unsupported_function('where')