- 
          
- 
                Notifications
    You must be signed in to change notification settings 
- Fork 19.2k
DEPR: na_sentinel in factorize #47157
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
552775f
              79231e7
              c822282
              05fa0ca
              f626dd8
              a15e43a
              9a33637
              46e7a8d
              b1edc89
              c8d6fa2
              0fd1ea7
              465ab2b
              6b4917c
              d8e3d6b
              0111eef
              39b3747
              5842053
              945bb04
              8f8cb18
              9630edc
              5524d53
              fcd8a75
              c615de4
              80aaf5e
              dc97a7b
              eb6d5a1
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -80,6 +80,7 @@ | |
| na_value_for_dtype, | ||
| ) | ||
|  | ||
| from pandas.core import common as com | ||
| from pandas.core.array_algos.take import take_nd | ||
| from pandas.core.construction import ( | ||
| array as pd_array, | ||
|  | @@ -580,7 +581,8 @@ def factorize_array( | |
| def factorize( | ||
| values, | ||
| sort: bool = False, | ||
| na_sentinel: int | None = -1, | ||
| na_sentinel: int | None | lib.NoDefault = lib.no_default, | ||
| use_na_sentinel: bool | lib.NoDefault = lib.no_default, | ||
| size_hint: int | None = None, | ||
| ) -> tuple[np.ndarray, np.ndarray | Index]: | ||
| """ | ||
|  | @@ -598,7 +600,19 @@ def factorize( | |
| Value to mark "not found". If None, will not drop the NaN | ||
| from the uniques of the values. | ||
|  | ||
| .. deprecated:: 1.5.0 | ||
| The na_sentinel argument is deprecated and | ||
| will be removed in a future version of pandas. Specify use_na_sentinel as | ||
| either True or False. | ||
|  | ||
| .. versionchanged:: 1.1.2 | ||
|  | ||
| use_na_sentinel : bool, default True | ||
| If True, the sentinel -1 will be used for NaN values. If False, | ||
| NaN values will be encoded as non-negative integers and will not drop the | ||
| NaN from the uniques of the values. | ||
|  | ||
| .. versionadded:: 1.5.0 | ||
| {size_hint}\ | ||
|  | ||
| Returns | ||
|  | @@ -646,8 +660,8 @@ def factorize( | |
| >>> uniques | ||
| array(['a', 'b', 'c'], dtype=object) | ||
|  | ||
| Missing values are indicated in `codes` with `na_sentinel` | ||
| (``-1`` by default). Note that missing values are never | ||
| When ``use_na_sentinel=True`` (the default), missing values are indicated in | ||
| the `codes` with the sentinel value ``-1`` and missing values are not | ||
| included in `uniques`. | ||
|  | ||
| >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) | ||
|  | @@ -682,16 +696,16 @@ def factorize( | |
| Index(['a', 'c'], dtype='object') | ||
|  | ||
| If NaN is in the values, and we want to include NaN in the uniques of the | ||
| values, it can be achieved by setting ``na_sentinel=None``. | ||
| values, it can be achieved by setting ``use_na_sentinel=False``. | ||
|  | ||
| >>> values = np.array([1, 2, 1, np.nan]) | ||
| >>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1 | ||
| >>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True | ||
| >>> codes | ||
| array([ 0, 1, 0, -1]) | ||
| >>> uniques | ||
| array([1., 2.]) | ||
|  | ||
| >>> codes, uniques = pd.factorize(values, na_sentinel=None) | ||
| >>> codes, uniques = pd.factorize(values, use_na_sentinel=False) | ||
| >>> codes | ||
| array([0, 1, 0, 2]) | ||
| >>> uniques | ||
|  | @@ -706,7 +720,13 @@ def factorize( | |
| # responsible only for factorization. All data coercion, sorting and boxing | ||
| # should happen here. | ||
|  | ||
| # Can't always warn here because EA's factorize will warn too; warn for each | ||
| # path below. | ||
| passed_na_sentinel = na_sentinel | ||
| na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel, warn=False) | ||
| if isinstance(values, ABCRangeIndex): | ||
| # Emit warning if appropriate | ||
| _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) | ||
| return values.factorize(sort=sort) | ||
|  | ||
| values = _ensure_arraylike(values) | ||
|  | @@ -725,15 +745,30 @@ def factorize( | |
| isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) | ||
| and values.freq is not None | ||
| ): | ||
| # Emit warning if appropriate | ||
| _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) | ||
| # The presence of 'freq' means we can fast-path sorting and know there | ||
| # aren't NAs | ||
| codes, uniques = values.factorize(sort=sort) | ||
| return _re_wrap_factorize(original, uniques, codes) | ||
|  | ||
| if not isinstance(values.dtype, np.dtype): | ||
| elif not isinstance(values.dtype, np.dtype): | ||
| # i.e. ExtensionDtype | ||
| codes, uniques = values.factorize(na_sentinel=na_sentinel) | ||
| if passed_na_sentinel is lib.no_default: | ||
| # User didn't specify na_sentinel; avoid warning. Note EA path always | ||
| # uses a na_sentinel value. | ||
| codes, uniques = values.factorize(use_na_sentinel=True) | ||
| elif passed_na_sentinel is None: | ||
| # Emit the appropriate warning message for None | ||
| _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) | ||
| codes, uniques = values.factorize(use_na_sentinel=True) | ||
| else: | ||
| # EA.factorize will warn | ||
| codes, uniques = values.factorize(na_sentinel=na_sentinel) | ||
|          | ||
|  | ||
| else: | ||
| # Generate warning for na_sentile if appropriate | ||
| _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel) | ||
| values = np.asarray(values) # convert DTA/TDA/MultiIndex | ||
| codes, uniques = factorize_array( | ||
| values, na_sentinel=na_sentinel, size_hint=size_hint | ||
|  | ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -704,3 +704,57 @@ def deprecate_numeric_only_default(cls: type, name: str, deprecate_none: bool = | |
| ) | ||
|  | ||
| warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) | ||
|  | ||
|  | ||
| def resolve_na_sentinel( | ||
|          | ||
| na_sentinel: int | None | lib.NoDefault, | ||
| use_na_sentinel: bool | lib.NoDefault, | ||
| warn: bool = True, | ||
| ) -> int | None: | ||
| """ | ||
| Determine value of na_sentinel for factorize methods. | ||
|  | ||
| See GH#46910 for details on the deprecation. | ||
|  | ||
| Parameters | ||
| ---------- | ||
| na_sentinel : int, None, or lib.no_default | ||
| Value passed to the method. | ||
| use_na_sentinel : bool or lib.no_default | ||
| Value passed to the method. | ||
| warn : bool, default True | ||
| Whether to emit a warning if a deprecated use is detected. | ||
|  | ||
| Returns | ||
| ------- | ||
| Resolved value of na_sentinel. | ||
| """ | ||
| if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default: | ||
| raise ValueError( | ||
| "Cannot specify both na_sentinel and use_na_sentile; " | ||
| f"got na_sentinel={na_sentinel} and use_na_sentinel={use_na_sentinel}" | ||
| ) | ||
| if na_sentinel is lib.no_default: | ||
| result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None | ||
| else: | ||
| if warn: | ||
| if na_sentinel is None: | ||
| msg = ( | ||
| "Specifying na_sentinel=None is deprecated, specify " | ||
| "use_na_sentinel=False instead." | ||
| ) | ||
| elif na_sentinel == -1: | ||
| msg = ( | ||
| "Specifying na_sentinel=-1 is deprecated, specify " | ||
| "use_na_sentinel=True instead." | ||
| ) | ||
| else: | ||
| msg = ( | ||
| "Specifying the specific value to use for na_sentinel is " | ||
| "deprecated and will be removed in a future version of pandas. " | ||
| "Specify na_sentinel=True to use the sentinel value -1, and " | ||
| "na_sentinel=False to encode NaN values." | ||
| ) | ||
| warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) | ||
| result = na_sentinel | ||
| return result | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I took this to mean use backticks for any argument / code in the warnings. I went and implemented that for all warnings.