Skip to content

Commit

Permalink
Add is_unique for Index & MultiIndex (#1766)
Browse files Browse the repository at this point in the history
This PR proposes `is_unique` for `Index` & `MultiIndex`.

Basically, it is the opposite of `has_duplicates`.

```python
>>> idx = ks.Index([1, 5, 7, 7])
>>> idx.is_unique
False

>>> idx = ks.Index([1, 5, 7])
>>> idx.is_unique
True

>>> idx = ks.Index(["Watermelon", "Orange", "Apple",
...                 "Watermelon"])
>>> idx.is_unique
False

>>> idx = ks.Index(["Orange", "Apple",
...                 "Watermelon"])
>>> idx.is_unique
True
```

Also fixed some tests to stay in sync with pandas.
  • Loading branch information
itholic authored Sep 14, 2020
1 parent f8e746e commit c62f83c
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 6 deletions.
45 changes: 39 additions & 6 deletions databricks/koalas/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,23 +456,56 @@ def has_duplicates(self) -> bool:
Examples
--------
>>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=list('aac'))
>>> kdf.index.has_duplicates
>>> idx = ks.Index([1, 5, 7, 7])
>>> idx.has_duplicates
True
>>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')])
>>> kdf.index.has_duplicates
>>> idx = ks.Index([1, 5, 7])
>>> idx.has_duplicates
False
>>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('aac'), list('eef')])
>>> kdf.index.has_duplicates
>>> idx = ks.Index(["Watermelon", "Orange", "Apple",
... "Watermelon"])
>>> idx.has_duplicates
True
>>> idx = ks.Index(["Orange", "Apple",
... "Watermelon"])
>>> idx.has_duplicates
False
"""
sdf = self._internal.spark_frame.select(self.spark.column)
scol = scol_for(sdf, sdf.columns[0])

return sdf.select(F.count(scol) != F.countDistinct(scol)).first()[0]

@property
def is_unique(self) -> bool:
    """
    Return if the index has unique values.

    This is the logical negation of ``has_duplicates``.

    Returns
    -------
    bool
        True when ``has_duplicates`` is falsy, False otherwise.

    See Also
    --------
    has_duplicates : The opposite check.

    Examples
    --------
    >>> idx = ks.Index([1, 5, 7, 7])
    >>> idx.is_unique
    False

    >>> idx = ks.Index([1, 5, 7])
    >>> idx.is_unique
    True

    >>> idx = ks.Index(["Watermelon", "Orange", "Apple", "Watermelon"])
    >>> idx.is_unique
    False

    >>> idx = ks.Index(["Orange", "Apple", "Watermelon"])
    >>> idx.is_unique
    True
    """
    # Delegate to has_duplicates so both properties always agree.
    return not self.has_duplicates

@property
def name(self) -> Union[str, Tuple[str, ...]]:
"""Return name of the Index."""
Expand Down
26 changes: 26 additions & 0 deletions databricks/koalas/tests/test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1445,3 +1445,29 @@ def test_inferred_type(self):
pmidx = pd.MultiIndex.from_tuples([("a", "x")])
kmidx = ks.from_pandas(pmidx)
self.assert_eq(pmidx.inferred_type, kmidx.inferred_type)

def test_index_is_unique(self):
    """Check ``Index.is_unique`` for named and unnamed single-level indexes."""
    # Each case: (index values, index name, expected is_unique result).
    cases = [
        (("a", "b", "c"), None, True),
        (("a", "a", "c"), "ks", False),
        ((1, 3, 3), "ks", False),
        ((1, 2, 3), None, True),
    ]

    for values, index_name, unique in cases:
        pandas_index = pd.Index(values, name=index_name)
        pandas_frame = pd.DataFrame({"a": [1, 2, 3]}, index=pandas_index)
        koalas_frame = ks.from_pandas(pandas_frame)

        self.assertEqual(koalas_frame.index.is_unique, unique)

def test_multiindex_is_unique(self):
    """Check ``is_unique`` on indexes built from two parallel level lists."""
    # Each case: (list of per-level label lists, expected is_unique result).
    cases = [
        ([list("abc"), list("edf")], True),
        ([list("aac"), list("edf")], True),
        ([list("aac"), list("eef")], False),
        ([[1, 4, 4], [4, 6, 6]], False),
    ]

    for levels, unique in cases:
        pandas_frame = pd.DataFrame({"a": [1, 2, 3]}, index=levels)
        koalas_frame = ks.from_pandas(pandas_frame)

        self.assertEqual(koalas_frame.index.is_unique, unique)
1 change: 1 addition & 0 deletions docs/source/reference/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Properties
Index.is_monotonic
Index.is_monotonic_increasing
Index.is_monotonic_decreasing
Index.is_unique
Index.has_duplicates
Index.hasnans
Index.dtype
Expand Down

0 comments on commit c62f83c

Please sign in to comment.