11import numbers
2- from typing import TYPE_CHECKING , Type
2+ from typing import TYPE_CHECKING , Any , Tuple , Type
33import warnings
44
55import numpy as np
66
7- from pandas ._libs import lib
7+ from pandas ._libs import lib , missing as libmissing
88from pandas .compat import set_function_name
99
1010from pandas .core .dtypes .base import ExtensionDtype
@@ -61,13 +61,13 @@ class BooleanDtype(ExtensionDtype):
6161 @property
6262 def na_value (self ) -> "Scalar" :
6363 """
64- BooleanDtype uses :attr:`numpy.nan` as the missing NA value.
64+ BooleanDtype uses :attr:`pandas.NA` as the missing NA value.
6565
6666 .. warning::
6767
6868 `na_value` may change in a future release.
6969 """
70- return np . nan
70+ return libmissing . NA
7171
7272 @property
7373 def type (self ) -> Type :
@@ -223,7 +223,7 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
223223
224224 >>> pd.array([True, False, None], dtype="boolean")
225225 <BooleanArray>
226- [True, False, NaN]
226+ [True, False, NA]
227227 Length: 3, dtype: boolean
228228 """
229229
@@ -262,17 +262,17 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False):
262262 values , mask = coerce_to_array (scalars , copy = copy )
263263 return BooleanArray (values , mask )
264264
265+ def _values_for_factorize (self ) -> Tuple [np .ndarray , Any ]:
266+ data = self ._data .astype ("int8" )
267+ data [self ._mask ] = - 1
268+ return data , - 1
269+
265270 @classmethod
266271 def _from_factorized (cls , values , original : "BooleanArray" ):
267272 return cls ._from_sequence (values , dtype = original .dtype )
268273
269274 def _formatter (self , boxed = False ):
270- def fmt (x ):
271- if isna (x ):
272- return "NaN"
273- return str (x )
274-
275- return fmt
275+ return str
276276
277277 def __getitem__ (self , item ):
278278 if is_integer (item ):
@@ -281,25 +281,29 @@ def __getitem__(self, item):
281281 return self ._data [item ]
282282 return type (self )(self ._data [item ], self ._mask [item ])
283283
284- def _coerce_to_ndarray (self , force_bool : bool = False ):
284+ def _coerce_to_ndarray (self , dtype = None , na_value : "Scalar" = libmissing . NA ):
285285 """
286286 Coerce to an ndarray of the requested dtype (object dtype by default).
287287
288288 Parameters
289289 ----------
290- force_bool : bool, default False
291- If True, return bool array or raise error if not possible (in
292- presence of missing values)
290+ dtype : dtype, default object
291+ The numpy dtype to convert to
292+ na_value : scalar, optional
293+ Scalar missing value indicator to use in numpy array. Defaults
294+ to the native missing value indicator of this array (pd.NA).
293295 """
294- if force_bool :
296+ if dtype is None :
297+ dtype = object
298+ if is_bool_dtype (dtype ):
295299 if not self .isna ().any ():
296300 return self ._data
297301 else :
298302 raise ValueError (
299303 "cannot convert to bool numpy array in presence of missing values"
300304 )
301- data = self ._data .astype (object )
302- data [self ._mask ] = self . _na_value
305+ data = self ._data .astype (dtype )
306+ data [self ._mask ] = na_value
303307 return data
304308
305309 __array_priority__ = 1000 # higher than ndarray so ops dispatch to us
@@ -309,15 +313,8 @@ def __array__(self, dtype=None):
309313 the array interface, return my values
310314 We return an object array here to preserve our scalar values
311315 """
312- if dtype is not None :
313- if is_bool_dtype (dtype ):
314- return self ._coerce_to_ndarray (force_bool = True )
315- # TODO can optimize this to not go through object dtype for
316- # numeric dtypes
317- arr = self ._coerce_to_ndarray ()
318- return arr .astype (dtype , copy = False )
319316 # by default (no dtype specified), return an object array
320- return self ._coerce_to_ndarray ()
317+ return self ._coerce_to_ndarray (dtype = dtype )
321318
322319 def __arrow_array__ (self , type = None ):
323320 """
@@ -483,8 +480,17 @@ def astype(self, dtype, copy=True):
483480 return IntegerArray (
484481 self ._data .astype (dtype .numpy_dtype ), self ._mask .copy (), copy = False
485482 )
483+ # for integer, error if there are missing values
484+ if is_integer_dtype (dtype ):
485+ if self .isna ().any ():
486+ raise ValueError ("cannot convert NA to integer" )
487+ # for float dtype, ensure we use np.nan before casting (numpy cannot
488+ # deal with pd.NA)
489+ na_value = self ._na_value
490+ if is_float_dtype (dtype ):
491+ na_value = np .nan
486492 # coerce
487- data = self ._coerce_to_ndarray ()
493+ data = self ._coerce_to_ndarray (na_value = na_value )
488494 return astype_nansafe (data , dtype , copy = None )
489495
490496 def value_counts (self , dropna = True ):
@@ -594,8 +600,6 @@ def logical_method(self, other):
594600
595601 @classmethod
596602 def _create_comparison_method (cls , op ):
597- op_name = op .__name__
598-
599603 def cmp_method (self , other ):
600604
601605 if isinstance (other , (ABCDataFrame , ABCSeries , ABCIndexClass )):
@@ -617,21 +621,26 @@ def cmp_method(self, other):
617621 if len (self ) != len (other ):
618622 raise ValueError ("Lengths must match to compare" )
619623
620- # numpy will show a DeprecationWarning on invalid elementwise
621- # comparisons, this will raise in the future
622- with warnings .catch_warnings ():
623- warnings .filterwarnings ("ignore" , "elementwise" , FutureWarning )
624- with np .errstate (all = "ignore" ):
625- result = op (self ._data , other )
626-
627- # nans propagate
628- if mask is None :
629- mask = self ._mask
624+ if other is libmissing .NA :
625+ # numpy does not handle pd.NA well as "other" scalar (it returns
626+ # a scalar False instead of an array)
627+ result = np .zeros_like (self ._data )
628+ mask = np .ones_like (self ._data )
630629 else :
631- mask = self ._mask | mask
630+ # numpy will show a DeprecationWarning on invalid elementwise
631+ # comparisons, this will raise in the future
632+ with warnings .catch_warnings ():
633+ warnings .filterwarnings ("ignore" , "elementwise" , FutureWarning )
634+ with np .errstate (all = "ignore" ):
635+ result = op (self ._data , other )
636+
637+ # nans propagate
638+ if mask is None :
639+ mask = self ._mask .copy ()
640+ else :
641+ mask = self ._mask | mask
632642
633- result [mask ] = op_name == "ne"
634- return BooleanArray (result , np .zeros (len (result ), dtype = bool ), copy = False )
643+ return BooleanArray (result , mask , copy = False )
635644
636645 name = "__{name}__" .format (name = op .__name__ )
637646 return set_function_name (cmp_method , name , cls )
@@ -643,7 +652,7 @@ def _reduce(self, name, skipna=True, **kwargs):
643652 # coerce to a nan-aware float if needed
644653 if mask .any ():
645654 data = self ._data .astype ("float64" )
646- data [mask ] = self . _na_value
655+ data [mask ] = np . nan
647656
648657 op = getattr (nanops , "nan" + name )
649658 result = op (data , axis = 0 , skipna = skipna , mask = mask , ** kwargs )
0 commit comments