@@ -771,7 +771,8 @@ def _value_counts_arraylike(values, dropna):
771771 return keys , counts
772772
773773
774- def duplicated (values , keep = 'first' , return_inverse = False ):
774+ def duplicated (values , keep = 'first' , return_index = False , return_inverse = False ,
775+ stabilize = True ):
775776 """
776777 Return boolean ndarray denoting duplicate values.
777778
@@ -787,16 +788,32 @@ def duplicated(values, keep='first', return_inverse=False):
787788 - ``last`` : Mark duplicates as ``True`` except for the last
788789 occurrence.
789790 - False : Mark all duplicates as ``True``. This option is not
790- compatible with ``return_inverse``.
791+ compatible with ``return_index`` or ``return_inverse``.
792+ return_index : boolean, default False
793+ If True, also return the (array of) integer indices for the unique
794+ elements within values.
795+
796+ .. versionadded:: 0.24.0
791797 return_inverse : boolean, default False
792- Determines whether the mapping from unique elements to the original
793- index should be returned. If True, the output is a tuple.
798+ If True, also return the indices of the unique array that can be used
799+ to reconstruct values.
800+
801+ .. versionadded:: 0.24.0
802+ stabilize : boolean, default True
803+ This keyword is only relevant if index and/or inverse are returned. If
804+ True (the default), it will be ensured that index and inverse fit to
805+ the order of `values`. In case that index and inverse are not needed
806+ separately, but combined right away, this sorting process is
807+ unnecessary and can be disabled for improved performance by setting
808+ `stabilize=False`.
794809
795810 .. versionadded:: 0.24.0
796811
797812 Returns
798813 -------
799- duplicated : ndarray or or tuple of ndarray if return_inverse is True
814+ duplicated : ndarray or tuple of ndarray
815+ np.ndarray if both `return_index` and `return_inverse` are False.
816+ Otherwise, tuple of ndarray.
800817 """
801818
802819 if return_inverse and not keep :
@@ -808,33 +825,46 @@ def duplicated(values, keep='first', return_inverse=False):
808825 values , dtype , ndtype = _ensure_data (values )
809826 f = getattr (htable , "duplicated_{dtype}" .format (dtype = ndtype ))
810827 isdup = f (values , keep = keep )
811- if not return_inverse :
828+ if not ( return_index or return_inverse ) :
812829 return isdup
813830 elif not isdup .any ():
814831 # no need to calculate inverse if no duplicates
815832 inv = np .array (range (len (values )))
816- return isdup , inv
833+ return ( isdup ,) + ( inv ,) * return_index + ( inv ,) * return_inverse
817834
818835 if keep == 'first' :
819- # o2u: original indices to indices of ARRAY of unique values
820- # u2o: reduplication from array of unique values to original array
821- _ , o2u , u2o = np .unique (values , return_inverse = True ,
822- return_index = True )
823- inv = o2u [u2o ]
836+ # ind: indices into values of the first occurrence of each unique value
837+ # inv: indices into the array of unique values that reconstruct values
838+ # this fits together in the way that values[ind] are the unique values
839+ # and values[ind][inv] == values
840+ _ , ind , inv = np .unique (values , return_index = True ,
841+ return_inverse = True )
824842 elif keep == 'last' :
825843 # np.unique takes first occurrence as unique value,
826- # so we flip ids that first becomes last
844+ # so we flip values so that the last occurrence comes first
827845 values = values [::- 1 ]
828- _ , o2u , u2o = np .unique (values , return_inverse = True ,
829- return_index = True )
830- # the values in the ids-array correspond(ed) to the index of value ,
846+ _ , ind , inv = np .unique (values , return_index = True ,
847+ return_inverse = True )
848+ # the values in "values" correspond(ed) to the index of "values" ,
831849 # which is simply np.array(range(len(values))).
832- # By flipping ids around, we need to do the same for the index,
833- # ___because o2u and u2o are relative to that order___.
850+ # By flipping "values" around, we need to do the same for the index,
851+ # ___because ind and inv are relative to that order___.
834852 # Finally, to fit with the original order again, we need to flip the
835- # values around one last time.
836- inv = np .array (range (len (values )))[::- 1 ][o2u ][u2o ][::- 1 ]
837- return isdup , inv
853+ # result around one last time.
854+ ind , inv = np .array (range (len (values )))[::- 1 ][ind ], inv [::- 1 ]
855+
856+ if stabilize :
857+ # np.unique yields a __sorted__ list of uniques, and the index/inverse
858+ # are relative to this order. To restore the original order, we argsort
859+ # the returned index (corresponding to the mapping from values to
860+ # sorted, which is the wrong way around for us), and invert this
861+ # mapping once more (corresponding to the mapping from sorted back to
862+ # values), which is again done by argsorting.
863+ undo_sort = np .argsort (np .argsort (ind ))
864+ ind , inv = ind [undo_sort ], undo_sort [inv ]
865+
866+ res = (isdup ,) + (ind ,) * return_index + (inv ,) * return_inverse
867+ return res
838868
839869
840870def mode (values , dropna = True ):
0 commit comments