Add is_unique for Index & MultiIndex (#1766)

This PR adds an `is_unique` property to `Index` and `MultiIndex`. It is simply the opposite of `has_duplicates`:

```python
>>> idx = ks.Index([1, 5, 7, 7])
>>> idx.is_unique
False

>>> idx = ks.Index([1, 5, 7])
>>> idx.is_unique
True

>>> idx = ks.Index(["Watermelon", "Orange", "Apple",
...                 "Watermelon"])
>>> idx.is_unique
False

>>> idx = ks.Index(["Orange", "Apple",
...                 "Watermelon"])
>>> idx.is_unique
True
```

It also fixes some tests to keep them in sync with pandas'.
databricks · Sep 14, 2020 · c62f83c · c62f83c
1 parent f8e746e
commit c62f83c
Show file tree

Hide file tree

Showing 3 changed files with 66 additions and 6 deletions.
diff --git a/databricks/koalas/indexes.py b/databricks/koalas/indexes.py
@@ -456,23 +456,56 @@ def has_duplicates(self) -> bool:
 
         Examples
         --------
-        >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=list('aac'))
-        >>> kdf.index.has_duplicates
+        >>> idx = ks.Index([1, 5, 7, 7])
+        >>> idx.has_duplicates
         True
 
-        >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')])
-        >>> kdf.index.has_duplicates
+        >>> idx = ks.Index([1, 5, 7])
+        >>> idx.has_duplicates
         False
 
-        >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('aac'), list('eef')])
-        >>> kdf.index.has_duplicates
+        >>> idx = ks.Index(["Watermelon", "Orange", "Apple",
+        ...                 "Watermelon"])
+        >>> idx.has_duplicates
         True
+
+        >>> idx = ks.Index(["Orange", "Apple",
+        ...                 "Watermelon"])
+        >>> idx.has_duplicates
+        False
         """
         sdf = self._internal.spark_frame.select(self.spark.column)
         scol = scol_for(sdf, sdf.columns[0])
 
         return sdf.select(F.count(scol) != F.countDistinct(scol)).first()[0]
 
+    @property
+    def is_unique(self) -> bool:
+        """
+        Return True if the index has no duplicate values, False otherwise.
+
+        Examples
+        --------
+        >>> idx = ks.Index([1, 5, 7, 7])
+        >>> idx.is_unique
+        False
+
+        >>> idx = ks.Index([1, 5, 7])
+        >>> idx.is_unique
+        True
+
+        >>> idx = ks.Index(["Watermelon", "Orange", "Apple",
+        ...                 "Watermelon"])
+        >>> idx.is_unique
+        False
+
+        >>> idx = ks.Index(["Orange", "Apple",
+        ...                 "Watermelon"])
+        >>> idx.is_unique
+        True
+        """
+        return not self.has_duplicates
+
     @property
     def name(self) -> Union[str, Tuple[str, ...]]:
         """Return name of the Index."""

diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py
@@ -1445,3 +1445,29 @@ def test_inferred_type(self):
         pmidx = pd.MultiIndex.from_tuples([("a", "x")])
         kmidx = ks.from_pandas(pmidx)
         self.assert_eq(pmidx.inferred_type, kmidx.inferred_type)
+
+    def test_index_is_unique(self):
+        indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
+        names = [None, "ks", "ks", None]  # index name paired with each entry of `indexes`
+        is_uniq = [True, False, False, True]  # expected `is_unique` for each entry of `indexes`
+
+        for idx, name, expected in zip(indexes, names, is_uniq):
+            pdf = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index(idx, name=name))
+            kdf = ks.from_pandas(pdf)
+
+            self.assertEqual(kdf.index.is_unique, expected)
+
+    def test_multiindex_is_unique(self):
+        indexes = [
+            [list("abc"), list("edf")],
+            [list("aac"), list("edf")],  # first list repeats "a", but the paired entries differ
+            [list("aac"), list("eef")],  # the pair ("a", "e") appears twice
+            [[1, 4, 4], [4, 6, 6]],  # the pair (4, 6) appears twice
+        ]
+        is_uniq = [True, True, False, False]  # expected `is_unique` for each entry of `indexes`
+
+        for idx, expected in zip(indexes, is_uniq):
+            pdf = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
+            kdf = ks.from_pandas(pdf)
+
+            self.assertEqual(kdf.index.is_unique, expected)
diff --git a/docs/source/reference/indexing.rst b/docs/source/reference/indexing.rst
@@ -21,6 +21,7 @@ Properties
    Index.is_monotonic
    Index.is_monotonic_increasing
    Index.is_monotonic_decreasing
+   Index.is_unique
    Index.has_duplicates
    Index.hasnans
    Index.dtype