Add tests/asv for Series/(Multi)Index; refactor

h-vetinari · h-vetinari · commit 8acfe885efcf · 2018-07-05T17:27:14.000+02:00
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
@@ -412,10 +412,13 @@ def time_frame_nunique(self):
 class Duplicated(object):
 
     goal_time = 0.2
-    params = (['first', 'last'], [True, False])
+    params = (['first', 'last', False], [True, False])
     param_names = ['keep', 'return_inverse']
 
     def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
         n = (1 << 20)
         t = date_range('2015-01-01', freq='S', periods=(n // 64))
         xs = np.random.randn(n // 64).round(2)
diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
@@ -84,6 +84,24 @@ def time_modulo(self, dtype):
         self.index % 2
 
 
+class Duplicated(object):
+
+    goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
+        base = tm.makeStringIndex(n)
+        self.idx = Index(base[np.random.choice(n, k * n)])
+
+    def time_duplicated(self, keep, return_inverse):
+        self.idx.duplicated(keep=keep, return_inverse=return_inverse)
+
+
 class Range(object):
 
     goal_time = 0.2
diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py
@@ -83,17 +83,22 @@ def time_is_monotonic(self):
 class Duplicated(object):
 
     goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
 
-    def setup(self):
-        n, k = 200, 5000
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
         levels = [np.arange(n),
                   tm.makeStringIndex(n).values,
                   1000 + np.arange(n)]
         labels = [np.random.choice(n, (k * n)) for lev in levels]
         self.mi = MultiIndex(levels=levels, labels=labels)
 
-    def time_duplicated(self):
-        self.mi.duplicated()
+    def time_duplicated(self, keep, return_inverse):
+        self.mi.duplicated(keep=keep, return_inverse=return_inverse)
 
 
 class Sortlevel(object):
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
@@ -134,3 +134,21 @@ def setup(self):
 
     def time_series_datetimeindex_repr(self):
         getattr(self.s, 'a', None)
+
+
+class Duplicated(object):
+
+    goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
+        base = tm.makeStringIndex(n)
+        self.s = Series(base[np.random.choice(n, k * n)])
+
+    def time_series_duplicated(self, keep, return_inverse):
+        self.s.duplicated(keep=keep, return_inverse=return_inverse)
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -8,50 +8,71 @@ v0.24.0
 New features
 ~~~~~~~~~~~~
 
+.. _whatsnew_0240.enhancements.extension_array_operators:
+
+``ExtensionArray`` operator support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison
+operators (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``:
+
+1. Define each of the operators on your ``ExtensionArray`` subclass.
+2. Use an operator implementation from pandas that depends on operators that are already defined
+   on the underlying elements (scalars) of the ``ExtensionArray``.
+
+See the :ref:`ExtensionArray Operator Support
+<extending.extension.operator>` documentation section for details on both
+ways of adding operator support.
+
 .. _whatsnew_0240.enhancements.duplicated_inverse:
 
-``DataFrame.duplicated`` has gained the ``return_inverse`` kwarg
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The ``duplicated``-method has gained the ``return_inverse`` kwarg
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavours of ``Index`` has gained a ``return_inverse`` keyword,
+which is False by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
+that allows reconstructing the original object from the deduplicated, unique subset.
 
-The ``duplicated``-method has gained a ``return_inverse`` keyword. Specifying ``return_inverse=True`` will change the output from a single Series
-to a tuple of two Series, where the second Series contains the mapping from the indices of the deduplicated, unique subset back to the original index:
+For ``Index`` objects, the inverse is an ``np.ndarray``:
 
 .. ipython:: python
 
-    df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
-                      index=[1, 4, 9, 16, 25])
-    df
-    isduplicate, inverse = df.duplicated(return_inverse=True)  # default: keep='first'
+    idx = pd.Index(['a', 'b', 'b', 'c', 'a'])
+    idx.has_duplicates
+    isduplicate, inverse = idx.duplicated(return_inverse=True)  # default: keep='first'
     isduplicate
     inverse
 
-This allows to reconstruct the original DataFrame as follows:
+This allows reconstructing the original ``Index`` as follows:
 
 .. ipython:: python
 
-    unique = df.loc[~isduplicate]  # same as df.drop_duplicates()
+    unique = idx[~isduplicate]  # same as idx.drop_duplicates()
     unique
-    reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
-    reconstruct.equals(df)
 
-The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
-to construct an inverse).
+    reconstruct = unique[inverse]
+    reconstruct.equals(idx)
 
-.. _whatsnew_0240.enhancements.extension_array_operators
+For ``DataFrame`` and ``Series`` the inverse needs to take into account the original index as well, and is therefore a ``Series``,
+which contains the mapping from the index of the deduplicated, unique subset back to the original index.
 
-``ExtensionArray`` operator support
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. ipython:: python
 
-A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison
-operators. (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``:
+    df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
+                      index=[1, 4, 9, 16, 25])
+    df
+    isduplicate, inverse = df.duplicated(keep='last', return_inverse=True)
+    isduplicate
+    inverse
 
-1. Define each of the operators on your ``ExtensionArray`` subclass.
-2. Use an operator implementation from pandas that depends on operators that are already defined
-   on the underlying elements (scalars) of the ``ExtensionArray``.
+    unique = df.loc[~isduplicate]  # same as df.drop_duplicates(keep='last')
+    unique
+    reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
+    reconstruct.equals(df)
+
+The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
+to construct an inverse).
 
-See the :ref:`ExtensionArray Operator Support
-<extending.extension.operator>` documentation section for details on both
-ways of adding operator support.
 
 .. _whatsnew_0240.enhancements.other:
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -771,7 +771,8 @@ def _value_counts_arraylike(values, dropna):
     return keys, counts
 
 
-def duplicated(values, keep='first', return_inverse=False):
+def duplicated(values, keep='first', return_index=False, return_inverse=False,
+               stabilize=True):
     """
     Return boolean ndarray denoting duplicate values.
 
@@ -787,16 +788,32 @@ def duplicated(values, keep='first', return_inverse=False):
         - ``last`` : Mark duplicates as ``True`` except for the last
           occurrence.
         - False : Mark all duplicates as ``True``. This option is not
-          compatible with ``return_inverse``.
+          compatible with ``return_index`` or ``return_inverse``.
+    return_index : boolean, default False
+        If True, also return the (array of) integer indices for the unique
+        elements within values.
+
+        .. versionadded:: 0.24.0
     return_inverse : boolean, default False
-        Determines whether the mapping from unique elements to the original
-        index should be returned. If True, the output is a tuple.
+        If True, also return the indices of the unique array that can be used
+        to reconstruct values.
+
+        .. versionadded:: 0.24.0
+    stabilize : boolean, default True
+        This keyword is only relevant if index and/or inverse are returned. If
+        True (the default), index and inverse are guaranteed to follow the
+        order of `values`. If index and inverse are not needed separately,
+        but are combined right away, this sorting step is unnecessary and
+        can be disabled for improved performance by setting
+        `stabilize=False`.
 
         .. versionadded:: 0.24.0
 
     Returns
     -------
-    duplicated : ndarray or or tuple of ndarray if return_inverse is True
+    duplicated : ndarray or tuple of ndarray
+        np.ndarray if both `return_index` and `return_inverse` are False.
+        Otherwise, tuple of ndarray.
     """
 
     if return_inverse and not keep:
@@ -808,33 +825,46 @@ def duplicated(values, keep='first', return_inverse=False):
     values, dtype, ndtype = _ensure_data(values)
     f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
     isdup = f(values, keep=keep)
-    if not return_inverse:
+    if not (return_index or return_inverse):
         return isdup
     elif not isdup.any():
         # no need to calculate inverse if no duplicates
         inv = np.array(range(len(values)))
-        return isdup, inv
+        return (isdup,) + (inv,) * return_index + (inv,) * return_inverse
 
     if keep == 'first':
-        # o2u: original indices to indices of ARRAY of unique values
-        # u2o: reduplication from array of unique values to original array
-        _, o2u, u2o = np.unique(values, return_inverse=True,
-                                return_index=True)
-        inv = o2u[u2o]
+        # ind: indices into values that select the array of unique values
+        # inv: indices into that unique array that reconstruct values
+        # this fits together in the way that values[ind] are the unique values
+        # and values[ind][inv] == values
+        _, ind, inv = np.unique(values, return_index=True,
+                                return_inverse=True)
     elif keep == 'last':
         # np.unique takes first occurrence as unique value,
-        # so we flip ids that first becomes last
+        # so we flip values so that first becomes last
         values = values[::-1]
-        _, o2u, u2o = np.unique(values, return_inverse=True,
-                                return_index=True)
-        # the values in the ids-array correspond(ed) to the index of value,
+        _, ind, inv = np.unique(values, return_index=True,
+                                return_inverse=True)
+        # the values in "values" correspond(ed) to the index of "values",
         # which is simply np.array(range(len(values))).
-        # By flipping ids around, we need to do the same for the index,
-        # ___because o2u and u2o are relative to that order___.
+        # By flipping "values" around, we need to do the same for the index,
+        # ___because ind and inv are relative to that order___.
         # Finally, to fit with the original order again, we need to flip the
-        # values around one last time.
-        inv = np.array(range(len(values)))[::-1][o2u][u2o][::-1]
-    return isdup, inv
+        # result around one last time.
+        ind, inv = np.array(range(len(values)))[::-1][ind], inv[::-1]
+
+    if stabilize:
+        # np.unique yields a __sorted__ list of uniques, and the index/inverse
+        # are relative to this order. To restore the original order, we argsort
+        # the returned index (corresponding to the mapping from values to
+        # sorted, which is the wrong way around for us), and invert this
+        # mapping once more (corresponding to the mapping from sorted back to
+        # values), which is again done by argsorting.
+        undo_sort = np.argsort(np.argsort(ind))
+        ind, inv = ind[undo_sort], undo_sort[inv]
+
+    res = (isdup,) + (ind,) * return_index + (inv,) * return_inverse
+    return res
 
 
 def mode(values, dropna=True):
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -1266,11 +1266,15 @@ def duplicated(self, keep='first', return_inverse=False):
             return self._constructor(duplicated(self, keep=keep),
                                      index=self.index).__finalize__(self)
 
-        isdup_array, inv_array = duplicated(self, keep=keep,
-                                            return_inverse=return_inverse)
+        # return_inverse = True
+        isdup_array, ind_array, inv_array = duplicated(self, keep=keep,
+                                                       return_index=True,
+                                                       return_inverse=True,
+                                                       stabilize=False)
         isdup = self._constructor(isdup_array,
                                   index=self.index).__finalize__(self)
-        inv = self._constructor(self.index[inv_array], index=self.index)
+        inv = self._constructor(self.index[ind_array][inv_array],
+                                index=self.index)
         return isdup, inv
 
     # ----------------------------------------------------------------------
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4363,8 +4363,9 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
             - False : Mark all duplicates as ``True``. This option is not
               compatible with ``return_inverse``.
         return_inverse : boolean, default False
-            Determines whether the mapping from unique elements to the original
-            index should be returned. If True, the output is a tuple.
+            Determines whether the mapping from (the index of) unique elements
+            to the original index should be returned. If True, the output is
+            a tuple.
 
             .. versionadded:: 0.24.0
 
@@ -4409,10 +4410,13 @@ def f(vals):
         if not return_inverse:
             return Series(duplicated(ids, keep=keep), index=self.index)
 
-        isdup_array, inv_array = duplicated(ids, keep=keep,
-                                            return_inverse=return_inverse)
+        # return_inverse = True
+        isdup_array, ind_array, inv_array = duplicated(ids, keep=keep,
+                                                       return_index=True,
+                                                       return_inverse=True,
+                                                       stabilize=False)
         isdup = Series(isdup_array, index=self.index)
-        inv = Series(self.index[inv_array], index=self.index)
+        inv = Series(self.index[ind_array][inv_array], index=self.index)
         return isdup, inv
 
     # ----------------------------------------------------------------------
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4456,8 +4456,9 @@ def duplicated(self, keep='first', return_inverse=False):
             - ``False`` : Mark all duplicates as ``True``. This option is not
               compatible with ``return_inverse``.
         return_inverse : boolean, default False
-            Determines whether the mapping from unique elements to the original
-            index should be returned. If True, the output is a tuple.
+            Determines whether the mapping from (the index of) unique elements
+            to the original index should be returned. If True, the output is
+            a tuple.
 
             .. versionadded:: 0.24.0
 
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -392,10 +392,10 @@ def unique(self, level=None):
                                   ordered=result.ordered)
 
     @Appender(Index.duplicated.__doc__)
-    def duplicated(self, keep='first'):
-        from pandas._libs.hashtable import duplicated_int64
+    def duplicated(self, keep='first', return_inverse=False):
+        from pandas.core.algorithms import duplicated
         codes = self.codes.astype('i8')
-        return duplicated_int64(codes, keep)
+        return duplicated(codes, keep=keep, return_inverse=return_inverse)
 
     def _to_safe_for_reshape(self):
         """ convert to object if we are a categorical """
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1601,8 +1601,9 @@ def duplicated(self, keep='first', return_inverse=False):
             - ``False`` : Mark all duplicates as ``True``. This option is not
               compatible with ``return_inverse``.
         return_inverse : boolean, default False
-            Determines whether the mapping from unique elements to the original
-            index should be returned. If True, the output is a tuple.
+            Determines whether the mapping from (the index of) unique elements
+            to the original index should be returned. If True, the output is
+            a tuple.
 
             .. versionadded:: 0.24.0
 
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
diff --git a/pandas/tests/indexes/multi/test_unique_and_duplicates.py b/pandas/tests/indexes/multi/test_unique_and_duplicates.py
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py