-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Support nan-ops for object-typed arrays #1883
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 19 commits
1b9c05f
4f2f209
4c45504
e01d0f8
de9c05c
ebeea79
bb3b3b0
d194a8c
33724f4
9616915
670ae8a
4da55c4
9fb1715
de1c613
63c0a9d
3d0e22a
c31fb49
4dcc1aa
f93a618
2c32342
28f0e0a
e46d07d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -171,6 +171,83 @@ def _ignore_warnings_if(condition): | |||||||||
| yield | ||||||||||
|
|
||||||||||
|
|
||||||||||
def _nansum_object(value, axis=None, **kwargs):
    """Sum an object-dtype array, treating missing entries as zero.

    Missing values are replaced by 0 with ``fillna`` and the result is handed
    to the ``sum`` reduction obtained from ``_dask_or_eager_func``; ``axis``
    and any extra keyword arguments are forwarded unchanged.
    """
    zero_filled = fillna(value, 0)
    reduce_sum = _dask_or_eager_func('sum')
    return reduce_sum(zero_filled, axis=axis, **kwargs)
|
|
||||||||||
|
|
||||||||||
def _nan_minmax_object(func, fill_value_typ, value, axis=None, **kwargs):
    """NaN-skipping ``min``/``max`` for object-dtype arrays.

    Parameters
    ----------
    func : str
        Name of the reduction looked up through ``_dask_or_eager_func``.
    fill_value_typ : str
        ``'+inf'`` fills missing entries with the positive-infinity sentinel
        for ``value.dtype``; any other value selects the negative one.
    value : array
        Data to reduce.
    axis : optional
        Forwarded to ``count`` and to the reduction.
    **kwargs
        Forwarded to the reduction.

    Returns
    -------
    For a reduction result without a ``dtype`` attribute (scalar case), a
    numpy array of ``value.dtype``. Otherwise the reduction result passed
    through ``where_method`` with the condition ``valid_count != 0``.
    """
    if fill_value_typ == '+inf':
        sentinel = dtypes.get_pos_infinity(value.dtype)
    else:
        sentinel = dtypes.get_neg_infinity(value.dtype)

    n_valid = count(value, axis=axis)
    padded = fillna(value, sentinel)
    reduced = _dask_or_eager_func(func)(padded, axis=axis, **kwargs)

    if hasattr(reduced, 'dtype'):
        return where_method(reduced, n_valid != 0)

    # Scalar case: with no valid entry at all, substitute the dtype's fill
    # value for the sentinel the reduction produced.
    if n_valid == 0:
        reduced = dtypes.fill_value(value.dtype)
    return np.array(reduced, dtype=value.dtype)
|
|
||||||||||
|
|
||||||||||
def _nan_argminmax_object(func, fill_value_typ, value, axis=None, **kwargs):
    """NaN-skipping ``argmin``/``argmax`` for object-dtype arrays.

    Missing entries are replaced by an infinity sentinel (positive when
    ``fill_value_typ == '+inf'``, negative otherwise) before the reduction
    named by ``func`` is applied. The result is always returned as an integer
    numpy array.

    Raises
    ------
    ValueError
        If any reduced slice contains no valid value.
    """
    pick_sentinel = (dtypes.get_pos_infinity if fill_value_typ == '+inf'
                     else dtypes.get_neg_infinity)
    sentinel = pick_sentinel(value.dtype)

    n_valid = count(value, axis=axis)
    padded = fillna(value, sentinel)
    positions = _dask_or_eager_func(func)(padded, axis=axis, **kwargs)

    # dask seems to return a non-integer type here, so cast explicitly.
    if isinstance(padded, dask_array_type):
        positions = positions.astype(int)

    no_valid_entries = n_valid == 0
    if no_valid_entries.any():
        raise ValueError('All-NaN slice encountered')

    return np.array(positions, dtype=int)
|
|
||||||||||
|
|
||||||||||
def _nanmean_ddof_object(ddof, value, axis=None, **kwargs):
    """NaN-skipping mean for object-dtype arrays, with a ``ddof`` correction.

    Parameters
    ----------
    ddof : int
        Subtracted from the number of valid entries to form the divisor.
        ``_nanvar_object`` passes a non-zero value here; the plain mean
        uses 0.
    value : array
        Data to reduce. Missing entries are replaced by 0 before summing.
    axis : optional
        Forwarded to ``count`` and to the ``sum`` reduction.
    **kwargs
        Forwarded to the ``sum`` reduction. A ``dtype`` entry, if present,
        is extracted and used as the accumulator dtype.

    Returns
    -------
    ``sum / (valid_count - ddof)`` passed through ``where_method`` with the
    condition ``valid_count != 0``.
    """
    valid_count = count(value, axis=axis)
    value = fillna(value, 0)

    # The element type of an object array cannot be inferred, so when the
    # caller gives no dtype we accumulate in float
    # (https://github.com/dask/dask/issues/3162). In this branch the kind is
    # always 'O', so there is never a complex/float dtype to reuse.
    # NOTE(review): dask/dask#3137 may make this fallback unnecessary with a
    # recent dask release -- confirm before removing.
    dtype = kwargs.pop('dtype', None)
    if dtype is None and value.dtype.kind == 'O':
        dtype = float

    data = _dask_or_eager_func('sum')(value, axis=axis, dtype=dtype, **kwargs)
    data = data / (valid_count - ddof)
    return where_method(data, valid_count != 0)
|
|
||||||||||
|
|
||||||||||
def _nanvar_object(value, axis=None, **kwargs):
    """NaN-skipping variance for object-dtype arrays.

    Computes the mean with ``keepdims=True`` (and ``ddof=0``), then returns
    the ``ddof``-corrected mean of the squared deviations from it. ``ddof``
    is read from ``kwargs`` and defaults to 0; every remaining keyword
    argument is forwarded to ``_nanmean_ddof_object``.
    """
    ddof = kwargs.pop('ddof', 0)

    # The centring mean must keep its reduced dimensions so it lines up with
    # ``value``; drop any caller-supplied ``keepdims`` for that call only.
    mean_kwargs = {key: val for key, val in kwargs.items()
                   if key != 'keepdims'}
    centre = _nanmean_ddof_object(ddof=0, value=value, axis=axis,
                                  keepdims=True, **mean_kwargs)

    deviation = value.astype(centre.dtype) - centre
    return _nanmean_ddof_object(ddof, deviation ** 2, axis=axis, **kwargs)
|
|
||||||||||
|
|
||||||||||
# Dispatch table: reduction name -> NaN-skipping implementation used for
# object-dtype arrays. min/argmin fill missing entries with '+inf' and
# max/argmax with '-inf'.
_nan_object_funcs = dict(
    sum=_nansum_object,
    min=partial(_nan_minmax_object, 'min', '+inf'),
    max=partial(_nan_minmax_object, 'max', '-inf'),
    argmin=partial(_nan_argminmax_object, 'argmin', '+inf'),
    argmax=partial(_nan_argminmax_object, 'argmax', '-inf'),
    mean=partial(_nanmean_ddof_object, 0),
    var=_nanvar_object,
)
|
|
||||||||||
|
|
||||||||||
| def _create_nan_agg_method(name, numeric_only=False, np_compat=False, | ||||||||||
| no_bottleneck=False, coerce_strings=False, | ||||||||||
| keep_dims=False): | ||||||||||
|
|
@@ -185,27 +262,31 @@ def f(values, axis=None, skipna=None, **kwargs): | |||||||||
| if coerce_strings and values.dtype.kind in 'SU': | ||||||||||
| values = values.astype(object) | ||||||||||
|
|
||||||||||
| if skipna or (skipna is None and values.dtype.kind in 'cf'): | ||||||||||
| if skipna or (skipna is None and values.dtype.kind in 'cfO'): | ||||||||||
| if values.dtype.kind not in ['u', 'i', 'f', 'c']: | ||||||||||
| raise NotImplementedError( | ||||||||||
| 'skipna=True not yet implemented for %s with dtype %s' | ||||||||||
| % (name, values.dtype)) | ||||||||||
| nanname = 'nan' + name | ||||||||||
| if (isinstance(axis, tuple) or not values.dtype.isnative or | ||||||||||
| no_bottleneck or | ||||||||||
| (dtype is not None and np.dtype(dtype) != values.dtype)): | ||||||||||
| # bottleneck can't handle multiple axis arguments or non-native | ||||||||||
| # endianness | ||||||||||
| if np_compat: | ||||||||||
| eager_module = npcompat | ||||||||||
| else: | ||||||||||
| eager_module = np | ||||||||||
| func = _nan_object_funcs.get(name, None) | ||||||||||
| using_numpy_nan_func = True | ||||||||||
| if func is None or values.dtype.kind not in 'Ob': | ||||||||||
| raise NotImplementedError( | ||||||||||
| 'skipna=True not yet implemented for %s with dtype %s' | ||||||||||
| % (name, values.dtype)) | ||||||||||
| else: | ||||||||||
| kwargs.pop('dtype', None) | ||||||||||
| eager_module = bn | ||||||||||
| func = _dask_or_eager_func(nanname, eager_module) | ||||||||||
| using_numpy_nan_func = (eager_module is np or | ||||||||||
| eager_module is npcompat) | ||||||||||
| nanname = 'nan' + name | ||||||||||
| if (isinstance(axis, tuple) or not values.dtype.isnative or | ||||||||||
| no_bottleneck or (dtype is not None and | ||||||||||
| np.dtype(dtype) != values.dtype)): | ||||||||||
| # bottleneck can't handle multiple axis arguments or | ||||||||||
| # non-native endianness | ||||||||||
| if np_compat: | ||||||||||
| eager_module = npcompat | ||||||||||
| else: | ||||||||||
| eager_module = np | ||||||||||
| else: | ||||||||||
| kwargs.pop('dtype', None) | ||||||||||
| eager_module = bn | ||||||||||
| func = _dask_or_eager_func(nanname, eager_module) | ||||||||||
| using_numpy_nan_func = (eager_module is np or | ||||||||||
| eager_module is npcompat) | ||||||||||
| else: | ||||||||||
| func = _dask_or_eager_func(name) | ||||||||||
| using_numpy_nan_func = False | ||||||||||
|
|
@@ -214,7 +295,11 @@ def f(values, axis=None, skipna=None, **kwargs): | |||||||||
| return func(values, axis=axis, **kwargs) | ||||||||||
| except AttributeError: | ||||||||||
| if isinstance(values, dask_array_type): | ||||||||||
| msg = '%s is not yet implemented on dask arrays' % name | ||||||||||
| try: # dask/dask#3133 dask sometimes needs dtype argument | ||||||||||
| return func(values, axis=axis, dtype=values.dtype, | ||||||||||
| **kwargs) | ||||||||||
| except AttributeError: | ||||||||||
| msg = '%s is not yet implemented on dask arrays' % name | ||||||||||
| else: | ||||||||||
| assert using_numpy_nan_func | ||||||||||
| msg = ('%s is not available with skipna=False with the ' | ||||||||||
|
|
||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we want:

- `issubclass(dtype.type, (np.floating, np.integer))` -> `np.inf`
- `issubclass(dtype.type, np.complexfloating)` -> `np.inf + 1j * np.inf`

Using `np.inf` for integer types should be faster, since it doesn't require comparing everything as objects. And I think we need `np.inf + 1j * np.inf` to match numpy's sort order for complex values.

It's better to use `issubclass` with `dtype.type` because `np.issubdtype` has some weird (deprecated) fallback rules: https://github.com/numpy/numpy/blob/v1.14.0/numpy/core/numerictypes.py#L699-L758