11import operator
2- from typing import TYPE_CHECKING , Optional , Type , Union
2+ from typing import TYPE_CHECKING , Type , Union
33
44import numpy as np
55
@@ -122,9 +122,6 @@ class StringArray(PandasArray):
122122
123123 copy : bool, default False
124124 Whether to copy the array of data.
125- convert : bool, default False
126- If true, force conversion of non-na scalars to strings.
127- If False, raises a ValueError, if a scalar is neither a string nor na.
128125
129126 Attributes
130127 ----------
@@ -165,15 +162,7 @@ class StringArray(PandasArray):
165162 ['1', '1']
166163 Length: 2, dtype: string
167164
168- Instantiating StringArrays directly with non-strings arrays will raise an error
169- unless ``convert=True``.
170-
171- >>> pd.arrays.StringArray(np.array(['1', 1]))
172- ValueError: StringArray requires a sequence of strings or pandas.NA
173- >>> pd.arrays.StringArray(['1', 1], convert=True)
174- <StringArray>
175- ['1', '1']
176- Length: 2, dtype: string
165+ However, instantiating StringArrays directly with non-strings will raise an error.
177166
178167 For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:
179168
@@ -186,29 +175,22 @@ class StringArray(PandasArray):
186175 # undo the PandasArray hack
187176 _typ = "extension"
188177
def __init__(self, values, copy=False):
    """
    Wrap ``values`` in a StringArray and validate the stored data.

    Parameters
    ----------
    values : array-like
        Data to store. It is passed through ``extract_array`` before being
        handed to the parent constructor.
    copy : bool, default False
        Whether to copy the array of data; forwarded to the parent
        constructor.

    Raises
    ------
    ValueError
        Raised by ``_validate`` when ``values`` was not already an instance
        of this class and the stored data is not a sequence of strings or
        pandas.NA.
    """
    values = extract_array(values)

    super().__init__(values, copy=copy)
    self._dtype = StringDtype()
    # Validation reads ``self._ndarray``, so it has to run after the parent
    # constructor. Input that is already an instance of this class is not
    # validated again.
    if not isinstance(values, type(self)):
        self._validate()
200185
201- def _validate (self , values : Optional [ np . ndarray ] = None ) -> None :
186+ def _validate (self ) :
202187 """Validate that we only store NA or strings."""
203- if values is None :
204- values = self ._ndarray
205-
206- if len (values ) and not lib .is_string_array (values , skipna = True ):
188+ if len (self ._ndarray ) and not lib .is_string_array (self ._ndarray , skipna = True ):
207189 raise ValueError ("StringArray requires a sequence of strings or pandas.NA" )
208- if values .dtype != "object" :
190+ if self . _ndarray .dtype != "object" :
209191 raise ValueError (
210192 "StringArray requires a sequence of strings or pandas.NA. Got "
211- f"'{ values .dtype } ' dtype instead."
193+ f"'{ self . _ndarray .dtype } ' dtype instead."
212194 )
213195
214196 @classmethod
@@ -217,8 +199,18 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
217199 assert dtype == "string"
218200
219201 result = np .asarray (scalars , dtype = "object" )
220-
221- return cls (result , copy = copy , convert = True )
202+ # convert non-na-likes to str, and nan-likes to StringDtype.na_value
203+ result = lib .ensure_string_array (
204+ result , na_value = StringDtype .na_value , copy = copy
205+ )
206+
207+ # Manually creating new array avoids the validation step in the __init__, so is
208+ # faster. Refactor need for validation?
209+ new_string_array = object .__new__ (cls )
210+ new_string_array ._dtype = StringDtype ()
211+ new_string_array ._ndarray = result
212+
213+ return new_string_array
222214
223215 @classmethod
224216 def _from_sequence_of_strings (cls , strings , dtype = None , copy = False ):
0 commit comments