|
26 | 26 | def cut(x, bins, right=True, labels=None, retbins=False, precision=3, |
27 | 27 | include_lowest=False): |
28 | 28 | """ |
29 | | - Return indices of half-open bins to which each value of `x` belongs. |
| 29 | + Bin values into discrete intervals. |
| 30 | +
|
| 31 | + Use `cut` when you need to segment and sort data values into bins. This |
| 32 | + function is also useful for going from a continuous variable to a |
| 33 | + categorical variable. For example, `cut` could convert ages to groups of |
| 34 | + age ranges. Supports binning into a number of equal-width bins, or a |
| 35 | + pre-specified array of bins. |
30 | 36 |
|
31 | 37 | Parameters |
32 | 38 | ---------- |
33 | 39 | x : array-like |
34 | | - Input array to be binned. It has to be 1-dimensional. |
35 | | - bins : int, sequence of scalars, or IntervalIndex |
36 | | - If `bins` is an int, it defines the number of equal-width bins in the |
37 | | - range of `x`. However, in this case, the range of `x` is extended |
38 | | - by .1% on each side to include the min or max values of `x`. If |
39 | | - `bins` is a sequence it defines the bin edges allowing for |
40 | | - non-uniform bin width. No extension of the range of `x` is done in |
41 | | - this case. |
42 | | - right : bool, optional |
43 | | - Indicates whether the bins include the rightmost edge or not. If |
44 | | - right == True (the default), then the bins [1,2,3,4] indicate |
45 | | - (1,2], (2,3], (3,4]. |
46 | | - labels : array or boolean, default None |
47 | | - Used as labels for the resulting bins. Must be of the same length as |
48 | | - the resulting bins. If False, return only integer indicators of the |
49 | | - bins. |
50 | | - retbins : bool, optional |
51 | | - Whether to return the bins or not. Can be useful if bins is given |
| 40 | + The input array to be binned. Must be 1-dimensional. |
| 41 | + bins : int, sequence of scalars, or pandas.IntervalIndex |
| 42 | + The criteria to bin by. |
| 43 | +
|
| 44 | + * int : Defines the number of equal-width bins in the range of `x`. The |
| 45 | + range of `x` is extended by .1% on each side to include the minimum |
| 46 | + and maximum values of `x`. |
| 47 | + * sequence of scalars : Defines the bin edges allowing for non-uniform |
| 48 | + width. No extension of the range of `x` is done. |
| 49 | + * IntervalIndex : Defines the exact bins to be used. |
| 50 | +
|
| 51 | + right : bool, default True |
| 52 | + Indicates whether `bins` includes the rightmost edge or not. If |
| 53 | + ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` |
| 54 | + indicate (1,2], (2,3], (3,4]. This argument is ignored when |
| 55 | + `bins` is an IntervalIndex. |
| 56 | + labels : array or bool, optional |
| 57 | + Specifies the labels for the returned bins. Must be the same length as |
| 58 | + the resulting bins. If False, returns only integer indicators of the |
| 59 | + bins. This affects the type of the output container (see below). |
| 60 | + This argument is ignored when `bins` is an IntervalIndex. |
| 61 | + retbins : bool, default False |
| 62 | + Whether to return the bins or not. Useful when bins is provided |
52 | 63 | as a scalar. |
53 | | - precision : int, optional |
54 | | - The precision at which to store and display the bins labels |
55 | | - include_lowest : bool, optional |
| 64 | + precision : int, default 3 |
| 65 | + The precision at which to store and display the bin labels. |
| 66 | + include_lowest : bool, default False |
56 | 67 | Whether the first interval should be left-inclusive or not. |
57 | 68 |
|
58 | 69 | Returns |
59 | 70 | ------- |
60 | | - out : Categorical or Series or array of integers if labels is False |
61 | | - The return type (Categorical or Series) depends on the input: a Series |
62 | | - of type category if input is a Series else Categorical. Bins are |
63 | | - represented as categories when categorical data is returned. |
64 | | - bins : ndarray of floats |
65 | | - Returned only if `retbins` is True. |
| 71 | + out : pandas.Categorical, Series, or ndarray |
| 72 | + An array-like object representing the respective bin for each value |
| 73 | + of `x`. The type depends on the value of `labels`. |
66 | 74 |
|
67 | | - Notes |
68 | | - ----- |
69 | | - The `cut` function can be useful for going from a continuous variable to |
70 | | - a categorical variable. For example, `cut` could convert ages to groups |
71 | | - of age ranges. |
| 75 | + * True (default) : returns a Series for Series `x` or a |
| 76 | + pandas.Categorical for all other inputs. The values stored within |
| 77 | + are Interval dtype. |
72 | 78 |
|
73 | | - Any NA values will be NA in the result. Out of bounds values will be NA in |
74 | | - the resulting Categorical object |
| 79 | + * sequence of scalars : returns a Series for Series `x` or a |
| 80 | + pandas.Categorical for all other inputs. The values stored within |
| 81 | + are whatever the type in the sequence is. |
75 | 82 |
|
| 83 | + * False : returns an ndarray of integers. |
| 84 | +
|
| 85 | + bins : numpy.ndarray or IntervalIndex |
| 86 | + The computed or specified bins. Only returned when `retbins=True`. |
| 87 | + For scalar or sequence `bins`, this is an ndarray with the computed |
| 88 | + bins. For an IntervalIndex `bins`, this is equal to `bins`. |
| 89 | +
|
| 90 | + See Also |
| 91 | + -------- |
| 92 | + qcut : Discretize variable into equal-sized buckets based on rank |
| 93 | + or based on sample quantiles. |
| 94 | + pandas.Categorical : Array type for storing data that come from a |
| 95 | + fixed set of values. |
| 96 | + Series : One-dimensional array with axis labels (including time series). |
| 97 | + pandas.IntervalIndex : Immutable Index implementing an ordered, |
| 98 | + sliceable set. |
| 99 | +
|
| 100 | + Notes |
| 101 | + ----- |
| 102 | + Any NA values will be NA in the result. Out of bounds values will be NA in |
| 103 | + the resulting Series or pandas.Categorical object. |
76 | 104 |
|
77 | 105 | Examples |
78 | 106 | -------- |
79 | | - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) |
| 107 | + Discretize into three equal-width bins. |
| 108 | +
|
| 109 | + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3) |
80 | 110 | ... # doctest: +ELLIPSIS |
81 | | - ([(0.19, 3.367], (0.19, 3.367], (0.19, 3.367], (3.367, 6.533], ... |
82 | | - Categories (3, interval[float64]): [(0.19, 3.367] < (3.367, 6.533] ... |
| 111 | + [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... |
| 112 | + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... |
83 | 113 |
|
84 | | - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), |
85 | | - ... 3, labels=["good", "medium", "bad"]) |
86 | | - ... # doctest: +SKIP |
87 | | - [good, good, good, medium, bad, good] |
88 | | - Categories (3, object): [good < medium < bad] |
| 114 | + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) |
| 115 | + ... # doctest: +ELLIPSIS |
| 116 | + ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... |
| 117 | + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... |
| 118 | + array([0.994, 3. , 5. , 7. ])) |
| 119 | +
|
| 120 | + Discover the same bins, but assign them specific labels. Notice that |
| 121 | + the returned Categorical's categories are `labels` and are ordered. |
| 122 | +
|
| 123 | + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), |
| 124 | + ... 3, labels=["bad", "medium", "good"]) |
| 125 | + [bad, good, medium, medium, good, bad] |
| 126 | + Categories (3, object): [bad < medium < good] |
89 | 127 |
|
90 | | - >>> pd.cut(np.ones(5), 4, labels=False) |
91 | | - array([1, 1, 1, 1, 1]) |
| 128 | + ``labels=False`` returns only the integer indicator of each value's bin. |
| 129 | +
|
| 130 | + >>> pd.cut([0, 1, 1, 2], bins=4, labels=False) |
| 131 | + array([0, 1, 1, 3]) |
| 132 | +
|
| 133 | + Passing a Series as an input returns a Series with categorical dtype: |
| 134 | +
|
| 135 | + >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), |
| 136 | + ... index=['a', 'b', 'c', 'd', 'e']) |
| 137 | + >>> pd.cut(s, 3) |
| 138 | + ... # doctest: +ELLIPSIS |
| 139 | + a (1.992, 4.667] |
| 140 | + b (1.992, 4.667] |
| 141 | + c (4.667, 7.333] |
| 142 | + d (7.333, 10.0] |
| 143 | + e (7.333, 10.0] |
| 144 | + dtype: category |
| 145 | + Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ... |
| 146 | +
|
| 147 | + Passing an IntervalIndex for `bins` results in those categories exactly. |
| 148 | + Notice that values not covered by the IntervalIndex are set to NaN. 0 |
| 149 | + is to the left of the first bin (which is closed on the right), and 1.5 |
| 150 | + falls between two bins. |
| 151 | +
|
| 152 | + >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) |
| 153 | + >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) |
| 154 | + [NaN, (0, 1], NaN, (2, 3], (4, 5]] |
| 155 | + Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] |
92 | 156 | """ |
93 | 157 | # NOTE: this binning code is changed a bit from histogram for var(x) == 0 |
94 | 158 |
|
|
0 commit comments