-
-
Couldn't load subscription status.
- Fork 19.2k
ENH/API: ExtensionArray.factorize #20361
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0ec3600
4707273
b61fb8d
44b6d72
5be3917
e474c20
c2578c3
b73e303
0db9e97
baf624c
ce92f7b
8cbfc36
425fb2a
7bbe796
31ed4c9
434df7d
505ad44
77a10b6
b59656f
201e029
9b0c2a9
eb19488
cbfee1a
35a8977
7efece2
ef8e6cb
dd3bf1d
6a6034f
5c758aa
5526398
cd5c2db
d5e8198
30941cb
3574273
c776133
2a79315
6ca65f8
bbedd8c
96ecab7
1010417
c288d67
55c9e31
163bfa3
872c24a
3c18428
703ab8a
ab32e0f
62fa538
28fad50
8580754
cf14ee1
8141131
a23d451
b25f3d4
dfcda85
eaff342
c05c807
e786253
465d458
6f8036e
bca4cdf
69c3ea2
fa8e221
c06da3a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,8 @@ | |
| """ | ||
| from __future__ import division | ||
| from warnings import warn, catch_warnings | ||
| from textwrap import dedent | ||
|
|
||
| import numpy as np | ||
|
|
||
| from pandas.core.dtypes.cast import ( | ||
|
|
@@ -34,7 +36,10 @@ | |
| from pandas.core import common as com | ||
| from pandas._libs import algos, lib, hashtable as htable | ||
| from pandas._libs.tslib import iNaT | ||
| from pandas.util._decorators import deprecate_kwarg | ||
| from pandas.util._decorators import (Appender, Substitution, | ||
| deprecate_kwarg) | ||
|
|
||
| _shared_docs = {} | ||
|
|
||
|
|
||
| # --------------- # | ||
|
|
@@ -146,10 +151,9 @@ def _reconstruct_data(values, dtype, original): | |
| Returns | ||
| ------- | ||
| Index for extension types, otherwise ndarray casted to dtype | ||
|
|
||
| """ | ||
| from pandas import Index | ||
| if is_categorical_dtype(dtype): | ||
| if is_extension_array_dtype(dtype): | ||
| pass | ||
| elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype): | ||
| values = Index(original)._shallow_copy(values, name=None) | ||
|
|
@@ -469,32 +473,124 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, | |
| return labels, uniques | ||
|
|
||
|
|
||
| @deprecate_kwarg(old_arg_name='order', new_arg_name=None) | ||
| def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): | ||
| """ | ||
| Encode input values as an enumerated type or categorical variable | ||
| _shared_docs['factorize'] = """ | ||
| Encode the object as an enumerated type or categorical variable. | ||
|
|
||
| This method is useful for obtaining a numeric representation of an | ||
| array when all that matters is identifying distinct values. `factorize` | ||
| is available as both a top-level function :func:`pandas.factorize`, | ||
| and as a method :meth:`Series.factorize` and :meth:`Index.factorize`. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| values : Sequence | ||
| ndarrays must be 1-D. Sequences that aren't pandas objects are | ||
| coerced to ndarrays before factorization. | ||
| sort : boolean, default False | ||
| Sort by values | ||
| %(values)s%(sort)s%(order)s | ||
| na_sentinel : int, default -1 | ||
| Value to mark "not found" | ||
| size_hint : hint to the hashtable sizer | ||
| Value to mark "not found". | ||
| %(size_hint)s\ | ||
|
|
||
| Returns | ||
| ------- | ||
| labels : the indexer to the original array | ||
| uniques : ndarray (1-d) or Index | ||
| the unique values. Index is returned when passed values is Index or | ||
| Series | ||
| labels : ndarray | ||
| An integer ndarray that's an indexer into `uniques`. | ||
| ``uniques.take(labels)`` will have the same values as `values`. | ||
| uniques : ndarray, Index, or Categorical | ||
| The unique valid values. When `values` is Categorical, `uniques` | ||
| is a Categorical. When `values` is some other pandas object, an | ||
| `Index` is returned. Otherwise, a 1-D ndarray is returned. | ||
|
|
||
| .. note :: | ||
|
|
||
| Even if there's a missing value in `values`, `uniques` will | ||
| *not* contain an entry for it. | ||
|
|
||
| See Also | ||
| -------- | ||
| pandas.cut : Discretize continuous-valued array. | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add |
||
| pandas.unique : Find the unique values in an array. | ||
|
|
||
| Examples | ||
| -------- | ||
| These examples all show factorize as a top-level method like | ||
| ``pd.factorize(values)``. The results are identical for methods like | ||
| :meth:`Series.factorize`. | ||
|
|
||
| >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) | ||
| >>> labels | ||
| array([0, 0, 1, 2, 0]) | ||
| >>> uniques | ||
| array(['b', 'a', 'c'], dtype=object) | ||
|
|
||
| With ``sort=True``, the `uniques` will be sorted, and `labels` will be | ||
| shuffled so that the relationship is maintained. | ||
|
|
||
| >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) | ||
| >>> labels | ||
| array([1, 1, 0, 2, 1]) | ||
| >>> uniques | ||
| array(['a', 'b', 'c'], dtype=object) | ||
|
|
||
| Missing values are indicated in `labels` with `na_sentinel` | ||
| (``-1`` by default). Note that missing values are never | ||
| included in `uniques`. | ||
|
|
||
| >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) | ||
| >>> labels | ||
| array([ 0, -1, 1, 2, 0]) | ||
| >>> uniques | ||
| array(['b', 'a', 'c'], dtype=object) | ||
|
|
||
| note: an array of Periods will ignore sort as it returns an always sorted | ||
| PeriodIndex. | ||
| Thus far, we've only factorized lists (which are internally coerced to | ||
| NumPy arrays). When factorizing pandas objects, the type of `uniques` | ||
| will differ. For Categoricals, a `Categorical` is returned. | ||
|
|
||
| >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) | ||
| >>> labels, uniques = pd.factorize(cat) | ||
| >>> labels | ||
| array([0, 0, 1]) | ||
| >>> uniques | ||
| [a, c] | ||
| Categories (3, object): [a, b, c] | ||
|
|
||
| Notice that ``'b'`` is in ``uniques.categories``, despite not being | ||
| present in ``cat.values``. | ||
|
|
||
| For all other pandas objects, an Index of the appropriate type is | ||
| returned. | ||
|
|
||
| >>> cat = pd.Series(['a', 'a', 'c']) | ||
| >>> labels, uniques = pd.factorize(cat) | ||
| >>> labels | ||
| array([0, 0, 1]) | ||
| >>> uniques | ||
| Index(['a', 'c'], dtype='object') | ||
| """ | ||
|
|
||
|
|
||
| @Substitution( | ||
| values=dedent("""\ | ||
| values : sequence | ||
| A 1-D sequence. Sequences that aren't pandas objects are | ||
| coerced to ndarrays before factorization. | ||
| """), | ||
| order=dedent("""\ | ||
| order | ||
| .. deprecated:: 0.23.0 | ||
|
|
||
| This parameter has no effect and is deprecated. | ||
| """), | ||
| sort=dedent("""\ | ||
| sort : bool, default False | ||
| Sort `uniques` and shuffle `labels` to maintain the | ||
| relationship. | ||
| """), | ||
| size_hint=dedent("""\ | ||
| size_hint : int, optional | ||
| Hint to the hashtable sizer. | ||
| """), | ||
| ) | ||
| @Appender(_shared_docs['factorize']) | ||
| @deprecate_kwarg(old_arg_name='order', new_arg_name=None) | ||
| def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): | ||
| # Implementation notes: This method is responsible for 3 things | ||
| # 1.) coercing data to array-like (ndarray, Index, extension array) | ||
| # 2.) factorizing labels and uniques | ||
|
|
@@ -507,9 +603,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): | |
| values = _ensure_arraylike(values) | ||
| original = values | ||
|
|
||
| if is_categorical_dtype(values): | ||
| if is_extension_array_dtype(values): | ||
| values = getattr(values, '_values', values) | ||
| labels, uniques = values.factorize() | ||
| labels, uniques = values.factorize(na_sentinel=na_sentinel) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: this was a bug in #19938 where I forgot to pass this through. It's covered by our extension tests. |
||
| dtype = original.dtype | ||
| else: | ||
| values, dtype, _ = _ensure_data(values) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If it is only a single entry, can we make this a variable (e.g.
`_shared_docstring_factorize`)? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have a slight preference for keeping it as a dictionary, since it looks like the docstrings for
`unique` and `value_counts` can be shared between ops and base. #20390