Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ Other API changes
Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`)
- When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`)
- :meth:`Series.str.__iter__` was deprecated and will be removed in future releases (:issue:`28277`).

- Add ``ignore_index`` to :meth:`DataFrame.drop_duplicates` to reset index (:issue:`30114`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reverse the ordering here, e.g. "``.drop_duplicates`` has gained the ``ignore_index`` keyword".

move to other enhancements

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

moved and rephrased!


.. _whatsnew_1000.api.documentation:

Expand Down
9 changes: 9 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4587,6 +4587,7 @@ def drop_duplicates(
subset: Optional[Union[Hashable, Sequence[Hashable]]] = None,
keep: Union[str, bool] = "first",
inplace: bool = False,
ignore_index: bool = False,
) -> Optional["DataFrame"]:
"""
Return DataFrame with duplicate rows removed.
Expand All @@ -4606,6 +4607,8 @@ def drop_duplicates(
- False : Drop all duplicates.
inplace : bool, default False
Whether to drop duplicates in place or to return a copy.
ignore_index : bool, default False
If True, the resulting axis will be labeled 0, 1, …, n - 1.

Returns
-------
Expand All @@ -4621,8 +4624,14 @@ def drop_duplicates(
if inplace:
(inds,) = (-duplicated)._ndarray_values.nonzero()
new_data = self._data.take(inds)

if ignore_index:
new_data.axes[1] = ibase.default_index(len(inds))
self._update_inplace(new_data)
else:
if ignore_index:
idx = ibase.default_index(len(self[-duplicated]))
Copy link
Member

@WillAyd WillAyd Dec 23, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this block could be more succinctly written as:

result = self[~duplicated]
if ignore_index:
    result = result.reset_index(drop=True)

return result

Or something similar. FWIW, I think the current approach of evaluating ``self[~duplicated]`` twice can be costly for larger frames.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ahh, good call! @WillAyd

Will change! I think ``idx = ibase.default_index(sum(-duplicated))`` should also be faster than using ``self[-duplicated]``.

return self[-duplicated].set_index(idx)
return self[-duplicated]

return None
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/frame/test_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,3 +477,23 @@ def test_drop_duplicates_inplace():
expected = orig2.drop_duplicates(["A", "B"], keep=False)
result = df2
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "origin_dict, output_dict, ignore_index, output_index",
    [
        ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]),
        ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]),
        ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]),
        ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]),
    ],
)
def test_drop_duplicates_ignore_index(
    origin_dict, output_dict, ignore_index, output_index
):
    """
    Check ``DataFrame.drop_duplicates(ignore_index=...)`` on both code paths.

    Parameters
    ----------
    origin_dict : dict
        Column data used to build the input frame (contains duplicate rows).
    output_dict : dict
        Column data expected once the duplicate rows are dropped.
    ignore_index : bool
        Value forwarded to ``drop_duplicates``.
    output_index : list of int
        Expected index labels of the de-duplicated frame.
    """
    # GH 30114
    df = DataFrame(origin_dict)
    expected = DataFrame(output_dict, index=output_index)

    # Copy-returning path: the result carries the expected index ...
    result = df.drop_duplicates(ignore_index=ignore_index)
    tm.assert_frame_equal(result, expected)
    # ... and the caller's frame must be left exactly as it was.
    tm.assert_frame_equal(df, DataFrame(origin_dict))

    # In-place path: drop_duplicates relabels the index through a separate
    # branch when inplace=True, so it needs its own check.
    df_inplace = df.copy()
    df_inplace.drop_duplicates(ignore_index=ignore_index, inplace=True)
    tm.assert_frame_equal(df_inplace, expected)