Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Index unique #912

Merged
merged 12 commits into from
Oct 22, 2019
47 changes: 47 additions & 0 deletions databricks/koalas/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,50 @@ def is_object(self):
"""
return is_object_dtype(self.dtype)

def unique(self, level=None):
    """
    Return the distinct values of this index.

    Be aware that the order of the returned values might differ from the
    order produced by ``pandas.Index.unique``.

    Parameters
    ----------
    level : int or str, optional, default None
        When given, it is only validated against this single-level index
        (see ``_validate_index_level``); it does not otherwise change
        the result.

    Returns
    -------
    Index
        Index without duplicates.

    Examples
    --------
    >>> ks.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 1, 3]).index.unique()
    Int64Index([1, 3], dtype='int64')

    >>> ks.DataFrame({'a': ['a', 'b', 'c']}, index=['d', 'e', 'e']).index.unique()
    Index(['e', 'd'], dtype='object')
    """
    if level is not None:
        self._validate_index_level(level)

    # Keep only the index column and drop duplicated rows on the Spark side.
    distinct_sdf = self._kdf._sdf.select(self._scol).distinct()
    # Wrap the deduplicated Spark frame in a new DataFrame backing the result.
    deduplicated = DataFrame(self._kdf._internal.copy(sdf=distinct_sdf))
    return Index(deduplicated, scol=self._scol)

def _validate_index_level(self, level):
    """
    Check that ``level`` refers to the one and only level of this index.

    A single-level Index has nothing to look up, but the same checks as in
    MultiIndex are applied so that invalid levels are rejected.

    Parameters
    ----------
    level : int or str
        Integer levels must be 0 or -1; any other value must be equal to
        the name of this index.

    Raises
    ------
    IndexError
        If ``level`` is an integer other than 0 or -1.
    KeyError
        If ``level`` is not an integer and differs from ``self.name``.
    """
    if not isinstance(level, int):
        # Non-integer levels are treated as a name and must match exactly.
        if level != self.name:
            raise KeyError(
                "Requested level ({}) does not match index name ({})".format(level, self.name)
            )
        return

    if level > 0:
        raise IndexError("Too many levels: Index has only 1 level, not %d" % (level + 1))

    if level < -1:
        raise IndexError(
            "Too many levels: Index has only 1 level, %d is not a valid level number" % (level,)
        )

def copy(self, name=None):
"""
Make a copy of this object. name sets those attributes on the new object.
Expand Down Expand Up @@ -460,6 +504,9 @@ def to_pandas(self) -> pd.MultiIndex:

toPandas = to_pandas

def unique(self, level=None):
    """
    Not implemented for MultiIndex.

    Raises
    ------
    PandasNotImplementedError
        Always, regardless of ``level``.
    """
    raise PandasNotImplementedError(method_name='unique', class_name='MultiIndex')

# TODO: add 'name' parameter after pd.MultiIndex.name is implemented
def copy(self):
"""
Expand Down
2 changes: 0 additions & 2 deletions databricks/koalas/missing/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ class _MissingPandasLikeIndex(object):
to_numpy = unsupported_function('to_numpy')
transpose = unsupported_function('transpose')
union = unsupported_function('union')
unique = unsupported_function('unique')
value_counts = unsupported_function('value_counts')
view = unsupported_function('view')
where = unsupported_function('where')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@charlesdong1991, sorry can you remove unique at _MissingPandasLikeMultiIndex as well?

Technically, MultiIndex has a method called unique. It's a bit complicated because MultiIndex inherits from Index, so I suggested throwing an exception :-).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, removed! @HyukjinKwon

I should have some free time soon, so I'll see if I can add some properties and functions for Index and MultiIndex, and then implement unique for MultiIndex if itholic hasn't implemented them all beforehand xD 😝

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would be awesome!!

Expand Down Expand Up @@ -217,7 +216,6 @@ class _MissingPandasLikeMultiIndex(object):
transpose = unsupported_function('transpose')
truncate = unsupported_function('truncate')
union = unsupported_function('union')
unique = unsupported_function('unique')
value_counts = unsupported_function('value_counts')
view = unsupported_function('view')
where = unsupported_function('where')
Expand Down
15 changes: 15 additions & 0 deletions databricks/koalas/tests/test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,21 @@ def test_multi_index_names(self):
with self.assertRaises(PandasNotImplementedError):
kidx.name = 'renamed'

def test_index_unique(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry if I missed but did we have a test case for multi-index?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, i think their implementation is slightly different, and i thought the scope of this PR is for Index, not MultiIndex (I was supposed to do in a separate PR).

But do you prefer to have this unique function added for both Index and MultiIndex in this PR? @HyukjinKwon

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, it's fine to target to implement it only in Index. Can we explicitly throw a notimplemented exception manually?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i added a PandasNotImplementedError, pls let me know if this is what you meant, thanks.

kidx = self.kdf.index

# The order of the unique values differs from pandas, so the expected
# result is written out explicitly instead of being derived from pandas.
expected = pd.Int64Index([0, 6, 9, 5, 1, 3, 8], dtype='int64')

self.assert_eq(expected, kidx.unique())
self.assert_eq(expected, kidx.unique(level=0))

# assertRaisesRegex is used because assertRaisesRegexp is a deprecated alias.
with self.assertRaisesRegex(IndexError, "Too many levels"):
    kidx.unique(level=1)

# The parentheses are escaped so the requested level name is really matched;
# unescaped, "(hi)*" is an optional group and would match any level name.
with self.assertRaisesRegex(KeyError, r"Requested level \(hi\)"):
    kidx.unique(level='hi')

def test_multi_index_copy(self):
arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
idx = pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
Expand Down