pandas-dev
diff --git a/‎doc/source/indexing.rst‎
Lines changed: 6 additions & 4 deletions b/‎doc/source/indexing.rst‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎doc/source/whatsnew/v0.17.0.txt‎
Lines changed: 12 additions & 0 deletions b/‎doc/source/whatsnew/v0.17.0.txt‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎pandas/core/base.py‎
Lines changed: 18 additions & 9 deletions b/‎pandas/core/base.py‎
Lines changed: 18 additions & 9 deletions
diff --git a/‎pandas/core/frame.py‎
Lines changed: 18 additions & 9 deletions b/‎pandas/core/frame.py‎
Lines changed: 18 additions & 9 deletions
diff --git a/‎pandas/core/index.py‎
Lines changed: 13 additions & 9 deletions b/‎pandas/core/index.py‎
Lines changed: 13 additions & 9 deletions
diff --git a/‎pandas/core/series.py‎
Lines changed: 7 additions & 6 deletions b/‎pandas/core/series.py‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎pandas/hashtable.pyx‎
Lines changed: 17 additions & 3 deletions b/‎pandas/hashtable.pyx‎
Lines changed: 17 additions & 3 deletions
diff --git a/‎pandas/lib.pyx‎
Lines changed: 19 additions & 7 deletions b/‎pandas/lib.pyx‎
Lines changed: 19 additions & 7 deletions
@@ -1209,17 +1209,19 @@ takes as an argument the columns to use to identify duplicated rows.
 - ``drop_duplicates`` removes duplicate rows.
 
 By default, the first observed row of a duplicate set is considered unique, but
-each method has a ``take_last`` parameter that indicates the last observed row
-should be taken instead.
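+each method has a ``keep`` parameter to choose which occurrence is kept: ``'first'`` (default), ``'last'``, or ``False`` to treat every occurrence as a duplicate.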
 
 .. ipython:: python
 
    df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                     'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
                     'c' : np.random.randn(7)})
    df2.duplicated(['a','b'])
+   df2.duplicated(['a','b'], keep='last')
+   df2.duplicated(['a','b'], keep=False)
    df2.drop_duplicates(['a','b'])
-   df2.drop_duplicates(['a','b'], take_last=True)
+   df2.drop_duplicates(['a','b'], keep='last')
+   df2.drop_duplicates(['a','b'], keep=False)
 
 An alternative way to drop duplicates on the index is ``.groupby(level=0)`` combined with ``first()`` or ``last()``.
 
@@ -1230,7 +1232,7 @@ An alternative way to drop duplicates on the index is ``.groupby(level=0)`` comb
    df3.groupby(level=0).first()
 
    # a bit more verbose
-   df3.reset_index().drop_duplicates(subset='b', take_last=False).set_index('b')
+   df3.reset_index().drop_duplicates(subset='b', keep='first').set_index('b')
 
 .. _indexing.dictionarylike:
 
 
@@ -26,6 +26,16 @@ New features
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
+- ``drop_duplicates`` and ``duplicated`` now accept a ``keep`` keyword to keep the first occurrence, keep the last occurrence, or target all duplicates. (:issue:`6511`, :issue:`8505`)
+
+.. ipython :: python
+
+   s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
+   s.drop_duplicates()
+   s.drop_duplicates(keep='last')
+   s.drop_duplicates(keep=False)
+
+
 .. _whatsnew_0170.api:
 
 Backwards incompatible API changes
@@ -45,6 +55,8 @@ Other API Changes
 Deprecations
 ^^^^^^^^^^^^
 
+- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`)
+
 .. _whatsnew_0170.prior_deprecations:
 
 Removal of prior version deprecations/changes
 
@@ -6,7 +6,7 @@
 from pandas.core import common as com
 import pandas.core.nanops as nanops
 import pandas.lib as lib
-from pandas.util.decorators import Appender, cache_readonly
+from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
 from pandas.core.strings import StringMethods
 from pandas.core.common import AbstractMethodError
 
@@ -543,18 +543,23 @@ def _dir_deletions(self):
 
         Parameters
         ----------
-        take_last : boolean, default False
-            Take the last observed index in a group. Default first
+
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Drop duplicates except for the first occurrence.
+            - ``last`` : Drop duplicates except for the last occurrence.
+            - False : Drop all duplicates.
+        take_last : deprecated
         %(inplace)s
 
         Returns
         -------
         deduplicated : %(klass)s
         """)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
-    def drop_duplicates(self, take_last=False, inplace=False):
-        duplicated = self.duplicated(take_last=take_last)
+    def drop_duplicates(self, keep='first', inplace=False):
+        duplicated = self.duplicated(keep=keep)
         result = self[np.logical_not(duplicated)]
         if inplace:
             return self._update_inplace(result)
@@ -566,18 +571,22 @@ def drop_duplicates(self, take_last=False, inplace=False):
 
         Parameters
         ----------
-        take_last : boolean, default False
-            Take the last observed index in a group. Default first
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
+            - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
+            - False : Mark all duplicates as ``True``.
+        take_last : deprecated
 
         Returns
         -------
         duplicated : %(duplicated)s
         """)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
-    def duplicated(self, take_last=False):
+    def duplicated(self, keep='first'):
         keys = com._ensure_object(self.values)
-        duplicated = lib.duplicated(keys, take_last=take_last)
+        duplicated = lib.duplicated(keys, keep=keep)
         try:
             return self._constructor(duplicated,
                                      index=self.index).__finalize__(self)
 
@@ -2801,8 +2801,9 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
         else:
             return result
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
-    def drop_duplicates(self, subset=None, take_last=False, inplace=False):
+    def drop_duplicates(self, subset=None, keep='first', inplace=False):
         """
         Return DataFrame with duplicate rows removed, optionally only
         considering certain columns
@@ -2812,8 +2813,11 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
         subset : column label or sequence of labels, optional
             Only consider certain columns for identifying duplicates, by
             default use all of the columns
-        take_last : boolean, default False
-            Take the last observed row in a row. Defaults to the first row
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Drop duplicates except for the first occurrence.
+            - ``last`` : Drop duplicates except for the last occurrence.
+            - False : Drop all duplicates.
+        take_last : deprecated
         inplace : boolean, default False
             Whether to drop duplicates in place or to return a copy
         cols : kwargs only argument of subset [deprecated]
@@ -2822,7 +2826,7 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
         -------
         deduplicated : DataFrame
         """
-        duplicated = self.duplicated(subset, take_last=take_last)
+        duplicated = self.duplicated(subset, keep=keep)
 
         if inplace:
             inds, = (-duplicated).nonzero()
@@ -2831,8 +2835,9 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
         else:
             return self[-duplicated]
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
-    def duplicated(self, subset=None, take_last=False):
+    def duplicated(self, subset=None, keep='first'):
         """
         Return boolean Series denoting duplicate rows, optionally only
         considering certain columns
@@ -2842,9 +2847,13 @@ def duplicated(self, subset=None, take_last=False):
         subset : column label or sequence of labels, optional
             Only consider certain columns for identifying duplicates, by
             default use all of the columns
-        take_last : boolean, default False
-            For a set of distinct duplicate rows, flag all but the last row as
-            duplicated. Default is for all but the first row to be flagged
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Mark duplicates as ``True`` except for the
+              first occurrence.
+            - ``last`` : Mark duplicates as ``True`` except for the
+              last occurrence.
+            - False : Mark all duplicates as ``True``.
+        take_last : deprecated
         cols : kwargs only argument of subset [deprecated]
 
         Returns
@@ -2870,7 +2879,7 @@ def f(vals):
         labels, shape = map(list, zip( * map(f, vals)))
 
         ids = get_group_index(labels, shape, sort=False, xnull=False)
-        return Series(duplicated_int64(ids, take_last), index=self.index)
+        return Series(duplicated_int64(ids, keep), index=self.index)
 
     #----------------------------------------------------------------------
     # Sorting
 
@@ -16,7 +16,7 @@
 from pandas.lib import Timestamp, Timedelta, is_datetime_array
 from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
 from pandas.util.decorators import (Appender, Substitution, cache_readonly,
-                                    deprecate)
+                                    deprecate, deprecate_kwarg)
 import pandas.core.common as com
 from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype,
                                 _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype,
@@ -2571,14 +2571,16 @@ def drop(self, labels, errors='raise'):
             indexer = indexer[~mask]
         return self.delete(indexer)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
-    def drop_duplicates(self, take_last=False):
-        result = super(Index, self).drop_duplicates(take_last=take_last)
+    def drop_duplicates(self, keep='first'):
+        result = super(Index, self).drop_duplicates(keep=keep)
         return self._constructor(result)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-    def duplicated(self, take_last=False):
-        return super(Index, self).duplicated(take_last=take_last)
+    def duplicated(self, keep='first'):
+        return super(Index, self).duplicated(keep=keep)
 
 
     def _evaluate_with_timedelta_like(self, other, op, opstr):
@@ -2997,10 +2999,11 @@ def _engine(self):
     def is_unique(self):
         return not self.duplicated().any()
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-    def duplicated(self, take_last=False):
+    def duplicated(self, keep='first'):
         from pandas.hashtable import duplicated_int64
-        return duplicated_int64(self.codes.astype('i8'), take_last)
+        return duplicated_int64(self.codes.astype('i8'), keep)
 
     def get_loc(self, key, method=None):
         """
@@ -4147,15 +4150,16 @@ def _has_complex_internals(self):
     def is_unique(self):
         return not self.duplicated().any()
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-    def duplicated(self, take_last=False):
+    def duplicated(self, keep='first'):
         from pandas.core.groupby import get_group_index
         from pandas.hashtable import duplicated_int64
 
         shape = map(len, self.levels)
         ids = get_group_index(self.labels, shape, sort=False, xnull=False)
 
-        return duplicated_int64(ids, take_last)
+        return duplicated_int64(ids, keep)
 
     def get_value(self, series, key):
         # somewhat broken encapsulation
 
@@ -44,7 +44,7 @@
 import pandas.core.datetools as datetools
 import pandas.core.format as fmt
 import pandas.core.nanops as nanops
-from pandas.util.decorators import Appender, cache_readonly
+from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
 
 import pandas.lib as lib
 import pandas.tslib as tslib
@@ -1137,14 +1137,15 @@ def mode(self):
         from pandas.core.algorithms import mode
         return mode(self)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
-    def drop_duplicates(self, take_last=False, inplace=False):
-        return super(Series, self).drop_duplicates(take_last=take_last,
-                                                   inplace=inplace)
+    def drop_duplicates(self, keep='first', inplace=False):
+        return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs)
-    def duplicated(self, take_last=False):
-        return super(Series, self).duplicated(take_last=take_last)
+    def duplicated(self, keep='first'):
+        return super(Series, self).duplicated(keep=keep)
 
     def idxmin(self, axis=None, out=None, skipna=True):
         """
 
@@ -1051,23 +1051,37 @@ def mode_int64(ndarray[int64_t] values):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
+def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
     cdef:
         int ret = 0
         Py_ssize_t i, n = len(values)
         kh_int64_t * table = kh_init_int64()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
 
+        # for keep=False: maps each value to the index of its first occurrence
+        dict seen = dict()
+
     kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))
 
-    if take_last:
+    if keep == 'last':
         for i from n > i >=0:
             kh_put_int64(table, values[i], &ret)
             out[i] = ret == 0
-    else:
+    elif keep == 'first':
         for i from 0 <= i < n:
             kh_put_int64(table, values[i], &ret)
             out[i] = ret == 0
+    elif keep is False:
+        for i from 0 <= i < n:
+            row = values[i]
+            if row in seen:
+                out[i] = 1
+                out[seen[row]] = 1
+            else:
+                seen[row] = i
+                out[i] = 0
+    else:
+        raise ValueError('keep must be either "first", "last" or False')
 
     kh_destroy_int64(table)
     return out
 
@@ -1292,35 +1292,47 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
 
     return result
 
-def duplicated(ndarray[object] values, take_last=False):
+
+def duplicated(ndarray[object] values, object keep='first'):
     cdef:
         Py_ssize_t i, n
-        set seen = set()
+        dict seen = dict()
         object row
 
     n = len(values)
     cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
 
-    if take_last:
+    if keep == 'last':
         for i from n > i >= 0:
             row = values[i]
-
             if row in seen:
                 result[i] = 1
             else:
-                seen.add(row)
+                seen[row] = i
                 result[i] = 0
-    else:
+    elif keep == 'first':
         for i from 0 <= i < n:
             row = values[i]
             if row in seen:
                 result[i] = 1
             else:
-                seen.add(row)
+                seen[row] = i
                 result[i] = 0
+    elif keep is False:
+        for i from 0 <= i < n:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+                result[seen[row]] = 1
+            else:
+                seen[row] = i
+                result[i] = 0
+    else:
+        raise ValueError('keep must be either "first", "last" or False')
 
     return result.view(np.bool_)
 
+
 def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
     cdef:
         Py_ssize_t i, group_size, n, start