44""" 
55from  __future__ import  division 
66from  warnings  import  warn , catch_warnings 
7+ from  textwrap  import  dedent 
8+ 
79import  numpy  as  np 
810
911from  pandas .core .dtypes .cast  import  (
3436from  pandas .core  import  common  as  com 
3537from  pandas ._libs  import  algos , lib , hashtable  as  htable 
3638from  pandas ._libs .tslib  import  iNaT 
37- from  pandas .util ._decorators  import  deprecate_kwarg 
39+ from  pandas .util ._decorators  import  (Appender , Substitution ,
40+                                      deprecate_kwarg )
41+ 
42+ _shared_docs  =  {}
3843
3944
4045# --------------- # 
@@ -146,10 +151,9 @@ def _reconstruct_data(values, dtype, original):
146151    Returns 
147152    ------- 
148153    Index for extension types, otherwise ndarray casted to dtype 
149- 
150154    """ 
151155    from  pandas  import  Index 
152-     if  is_categorical_dtype (dtype ):
156+     if  is_extension_array_dtype (dtype ):
153157        pass 
154158    elif  is_datetime64tz_dtype (dtype ) or  is_period_dtype (dtype ):
155159        values  =  Index (original )._shallow_copy (values , name = None )
@@ -469,32 +473,124 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
469473    return  labels , uniques 
470474
471475
472- @deprecate_kwarg (old_arg_name = 'order' , new_arg_name = None ) 
473- def  factorize (values , sort = False , order = None , na_sentinel = - 1 , size_hint = None ):
474-     """ 
475-     Encode input values as an enumerated type or categorical variable 
476+ _shared_docs ['factorize' ] =  """ 
477+     Encode the object as an enumerated type or categorical variable. 
478+ 
479+     This method is useful for obtaining a numeric representation of an 
480+     array when all that matters is identifying distinct values. `factorize` 
481+     is available as both a top-level function :func:`pandas.factorize`, 
482+     and as a method :meth:`Series.factorize` and :meth:`Index.factorize`. 
476483
477484    Parameters 
478485    ---------- 
479-     values : Sequence 
480-         ndarrays must be 1-D. Sequences that aren't pandas objects are 
481-         coereced to ndarrays before factorization. 
482-     sort : boolean, default False 
483-         Sort by values 
486+     %(values)s%(sort)s%(order)s 
484487    na_sentinel : int, default -1 
485-         Value to mark "not found" 
486-     size_hint : hint to the hashtable sizer  
488+         Value to mark "not found".  
489+     %( size_hint)s \  
487490
488491    Returns 
489492    ------- 
490-     labels : the indexer to the original array 
491-     uniques : ndarray (1-d) or Index 
492-         the unique values. Index is returned when passed values is Index or 
493-         Series 
493+     labels : ndarray 
494+         An integer ndarray that's an indexer into `uniques`. 
495+         ``uniques.take(labels)`` will have the same values as `values`. 
496+     uniques : ndarray, Index, or Categorical 
497+         The unique valid values. When `values` is Categorical, `uniques` 
498+         is a Categorical. When `values` is some other pandas object, an 
499+         `Index` is returned. Otherwise, a 1-D ndarray is returned. 
500+ 
501+         .. note :: 
502+ 
503+            Even if there's a missing value in `values`, `uniques` will 
504+            *not* contain an entry for it. 
505+ 
506+     See Also 
507+     -------- 
508+     pandas.cut : Discretize continuous-valued array. 
509+     pandas.unique : Find the unique values in an array. 
510+ 
511+     Examples 
512+     -------- 
513+     These examples all show factorize as a top-level method like 
514+     ``pd.factorize(values)``. The results are identical for methods like 
515+     :meth:`Series.factorize`. 
516+ 
517+     >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) 
518+     >>> labels 
519+     array([0, 0, 1, 2, 0]) 
520+     >>> uniques 
521+     array(['b', 'a', 'c'], dtype=object) 
522+ 
523+     With ``sort=True``, the `uniques` will be sorted, and `labels` will be 
524+     shuffled so that the relationship is maintained. 
525+ 
526+     >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) 
527+     >>> labels 
528+     array([1, 1, 0, 2, 1]) 
529+     >>> uniques 
530+     array(['a', 'b', 'c'], dtype=object) 
531+ 
532+     Missing values are indicated in `labels` with `na_sentinel` 
533+     (``-1`` by default). Note that missing values are never 
534+     included in `uniques`. 
535+ 
536+     >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) 
537+     >>> labels 
538+     array([ 0, -1,  1,  2,  0]) 
539+     >>> uniques 
540+     array(['b', 'a', 'c'], dtype=object) 
494541
495-     note: an array of Periods will ignore sort as it returns an always sorted 
496-     PeriodIndex. 
542+     Thus far, we've only factorized lists (which are internally coerced to 
543+     NumPy arrays). When factorizing pandas objects, the type of `uniques` 
544+     will differ. For Categoricals, a `Categorical` is returned. 
545+ 
546+     >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) 
547+     >>> labels, uniques = pd.factorize(cat) 
548+     >>> labels 
549+     array([0, 0, 1]) 
550+     >>> uniques 
551+     [a, c] 
552+     Categories (3, object): [a, b, c] 
553+ 
554+     Notice that ``'b'`` is in ``uniques.categories``, despite not being 
555+     present in ``cat.values``. 
556+ 
557+     For all other pandas objects, an Index of the appropriate type is 
558+     returned. 
559+ 
560+     >>> cat = pd.Series(['a', 'a', 'c']) 
561+     >>> labels, uniques = pd.factorize(cat) 
562+     >>> labels 
563+     array([0, 0, 1]) 
564+     >>> uniques 
565+     Index(['a', 'c'], dtype='object') 
497566    """ 
567+ 
568+ 
569+ @Substitution ( 
570+     values = dedent ("""\   
571+ 
572+         A 1-D sequence. Sequences that aren't pandas objects are 
573+         coerced to ndarrays before factorization. 
574+     """ ),
575+     order = dedent ("""\   
576+ 
577+         .. deprecated:: 0.23.0 
578+ 
579+            This parameter has no effect and is deprecated. 
580+     """ ),
581+     sort = dedent ("""\   
582+ 
583+         Sort `uniques` and shuffle `labels` to maintain the 
584+         relationship. 
585+     """ ),
586+     size_hint = dedent ("""\   
587+ 
588+         Hint to the hashtable sizer. 
589+     """ ),
590+ ) 
591+ @Appender (_shared_docs ['factorize' ]) 
592+ @deprecate_kwarg (old_arg_name = 'order' , new_arg_name = None ) 
593+ def  factorize (values , sort = False , order = None , na_sentinel = - 1 , size_hint = None ):
498594    # Implementation notes: This method is responsible for 3 things 
499595    # 1.) coercing data to array-like (ndarray, Index, extension array) 
500596    # 2.) factorizing labels and uniques 
@@ -507,9 +603,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
507603    values  =  _ensure_arraylike (values )
508604    original  =  values 
509605
510-     if  is_categorical_dtype (values ):
606+     if  is_extension_array_dtype (values ):
511607        values  =  getattr (values , '_values' , values )
512-         labels , uniques  =  values .factorize ()
608+         labels , uniques  =  values .factorize (na_sentinel = na_sentinel )
513609        dtype  =  original .dtype 
514610    else :
515611        values , dtype , _  =  _ensure_data (values )
0 commit comments