33intended for public consumption
44"""
55from textwrap import dedent
6- from typing import Dict
6+ from typing import Dict , Optional , Tuple
77from warnings import catch_warnings , simplefilter , warn
88
99import numpy as np
@@ -501,9 +501,9 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
501501
502502 Returns
503503 -------
504- labels : ndarray
504+ codes : ndarray
505505 An integer ndarray that's an indexer into `uniques`.
506- ``uniques.take(labels )`` will have the same values as `values`.
506+ ``uniques.take(codes )`` will have the same values as `values`.
507507 uniques : ndarray, Index, or Categorical
508508 The unique valid values. When `values` is Categorical, `uniques`
509509 is a Categorical. When `values` is some other pandas object, an
@@ -525,27 +525,27 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
525525 ``pd.factorize(values)``. The results are identical for methods like
526526 :meth:`Series.factorize`.
527527
528- >>> labels , uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
529- >>> labels
528+ >>> codes , uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
529+ >>> codes
530530 array([0, 0, 1, 2, 0])
531531 >>> uniques
532532 array(['b', 'a', 'c'], dtype=object)
533533
534- With ``sort=True``, the `uniques` will be sorted, and `labels ` will be
534+ With ``sort=True``, the `uniques` will be sorted, and `codes ` will be
535535 shuffled so that the relationship is maintained.
536536
537- >>> labels , uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
538- >>> labels
537+ >>> codes , uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
538+ >>> codes
539539 array([1, 1, 0, 2, 1])
540540 >>> uniques
541541 array(['a', 'b', 'c'], dtype=object)
542542
543- Missing values are indicated in `labels ` with `na_sentinel`
543+ Missing values are indicated in `codes ` with `na_sentinel`
544544 (``-1`` by default). Note that missing values are never
545545 included in `uniques`.
546546
547- >>> labels , uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
548- >>> labels
547+ >>> codes , uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
548+ >>> codes
549549 array([ 0, -1, 1, 2, 0])
550550 >>> uniques
551551 array(['b', 'a', 'c'], dtype=object)
@@ -555,8 +555,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
555555 will differ. For Categoricals, a `Categorical` is returned.
556556
557557 >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
558- >>> labels , uniques = pd.factorize(cat)
559- >>> labels
558+ >>> codes , uniques = pd.factorize(cat)
559+ >>> codes
560560 array([0, 0, 1])
561561 >>> uniques
562562 [a, c]
@@ -569,8 +569,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
569569 returned.
570570
571571 >>> cat = pd.Series(['a', 'a', 'c'])
572- >>> labels , uniques = pd.factorize(cat)
573- >>> labels
572+ >>> codes , uniques = pd.factorize(cat)
573+ >>> codes
574574 array([0, 0, 1])
575575 >>> uniques
576576 Index(['a', 'c'], dtype='object')
@@ -596,7 +596,7 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
596596 sort = dedent (
597597 """\
598598 sort : bool, default False
599- Sort `uniques` and shuffle `labels ` to maintain the
599+ Sort `uniques` and shuffle `codes ` to maintain the
600600 relationship.
601601 """
602602 ),
@@ -609,11 +609,17 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
609609)
610610@Appender (_shared_docs ["factorize" ])
611611@deprecate_kwarg (old_arg_name = "order" , new_arg_name = None )
612- def factorize (values , sort : bool = False , order = None , na_sentinel = - 1 , size_hint = None ):
612+ def factorize (
613+ values ,
614+ sort : bool = False ,
615+ order = None ,
616+ na_sentinel : int = - 1 ,
617+ size_hint : Optional [int ] = None ,
618+ ) -> Tuple [np .ndarray , ABCIndex ]:
613619 # Implementation notes: This method is responsible for 3 things
614620 # 1.) coercing data to array-like (ndarray, Index, extension array)
615- # 2.) factorizing labels and uniques
616- # 3.) Maybe boxing the output in an Index
621+ # 2.) factorizing codes and uniques
622+ # 3.) Maybe boxing the uniques in an Index
617623 #
618624 # Step 2 is dispatched to extension types (like Categorical). They are
619625 # responsible only for factorization. All data coercion, sorting and boxing
@@ -624,7 +630,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
624630
625631 if is_extension_array_dtype (values ):
626632 values = extract_array (values )
627- labels , uniques = values .factorize (na_sentinel = na_sentinel )
633+ codes , uniques = values .factorize (na_sentinel = na_sentinel )
628634 dtype = original .dtype
629635 else :
630636 values , dtype = _ensure_data (values )
@@ -634,13 +640,13 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
634640 else :
635641 na_value = None
636642
637- labels , uniques = _factorize_array (
643+ codes , uniques = _factorize_array (
638644 values , na_sentinel = na_sentinel , size_hint = size_hint , na_value = na_value
639645 )
640646
641647 if sort and len (uniques ) > 0 :
642- uniques , labels = safe_sort (
643- uniques , labels , na_sentinel = na_sentinel , assume_unique = True , verify = False
648+ uniques , codes = safe_sort (
649+ uniques , codes , na_sentinel = na_sentinel , assume_unique = True , verify = False
644650 )
645651
646652 uniques = _reconstruct_data (uniques , dtype , original )
@@ -653,7 +659,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
653659
654660 uniques = Index (uniques )
655661
656- return labels , uniques
662+ return codes , uniques
657663
658664
659665def value_counts (
0 commit comments