Merged

Changes from 19 commits
11 changes: 10 additions & 1 deletion doc/whats-new.rst
@@ -43,7 +43,16 @@ Documentation

Enhancements
~~~~~~~~~~~~
- reduce methods such as :py:func:`DataArray.sum()` now accepts ``dtype``
- Reduce methods such as :py:func:`DataArray.sum()` now handle object-type arrays.

.. ipython:: python

da = xr.DataArray(np.array([True, False, np.nan], dtype=object), dims='x')
da.sum()

(:issue:`1866`)
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
- Reduce methods such as :py:func:`DataArray.sum()` now accepts ``dtype``
arguments. (:issue:`1838`)
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
- Added nodatavals attribute to DataArray when using :py:func:`~xarray.open_rasterio`. (:issue:`1736`).
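For illustration, a minimal usage sketch of the new ``dtype`` argument described in the entries above (not part of the diff; it assumes ``numpy`` and ``xarray`` are imported as ``np`` and ``xr``):

    da = xr.DataArray(np.array([1.0, 2.0, 3.0]), dims='x')
    da.sum(dtype='float64')
    da.mean(dtype=np.float32)
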
58 changes: 58 additions & 0 deletions xarray/core/dtypes.py
@@ -1,4 +1,5 @@
import numpy as np
import functools

from . import utils

@@ -7,6 +8,29 @@
NA = utils.ReprObject('<NA>')


@functools.total_ordering
class AlwaysGreaterThan(object):
def __gt__(self, other):
return True

def __eq__(self, other):
return isinstance(other, type(self))


@functools.total_ordering
class AlwaysLessThan(object):
def __lt__(self, other):
return True

def __eq__(self, other):
return isinstance(other, type(self))


# Equivalence to np.inf (-np.inf) for object-type
INF = AlwaysGreaterThan()
NINF = AlwaysLessThan()
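
For illustration (not part of the diff), a brief sketch of how these sentinels behave: ``INF`` compares greater than, and ``NINF`` less than, any other value, which lets the object-dtype reductions use them the way float reductions use ``np.inf``:

    INF > 1e300          # True
    INF > 'any string'   # True
    NINF < 0             # True
    NINF < INF           # True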


def maybe_promote(dtype):
"""Simpler equivalent of pandas.core.common._maybe_promote

@@ -55,6 +79,40 @@ def get_fill_value(dtype):
return fill_value


def get_pos_infinity(dtype):
"""Return an appropriate positive infinity for this dtype.

Parameters
----------
dtype : np.dtype

Returns
-------
fill_value : positive infinity value corresponding to this dtype.
"""
if np.issubdtype(dtype, np.floating):
Review comment (Member):

I think we want:

  • issubclass(dtype.type, (np.floating, np.integer)) -> np.inf
  • issubclass(dtype.type, np.complexfloating) -> np.inf + 1j * np.inf

Using np.inf for integer types should be faster, since it doesn't require comparing everything as objects. And I think we need np.inf + 1j * np.inf to match numpy's sort order for complex values.

It's better to use issubclass with dtype.type because np.issubdtype has some weird (deprecated) fallback rules: https://github.com/numpy/numpy/blob/v1.14.0/numpy/core/numerictypes.py#L699-L758
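
A sketch of what this suggestion might look like (a hypothetical revision, not the code in this diff; it assumes the module-level ``np`` import and the ``INF`` sentinel defined above):

    def get_pos_infinity(dtype):
        # integers and floats can share np.inf; complex needs infinity in both parts
        if issubclass(dtype.type, (np.floating, np.integer)):
            return np.inf
        if issubclass(dtype.type, np.complexfloating):
            return np.inf + 1j * np.inf
        # object dtype falls back to the AlwaysGreaterThan sentinel
        return INF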

return np.inf

return INF


def get_neg_infinity(dtype):
"""Return an appropriate positive infinity for this dtype.

Parameters
----------
dtype : np.dtype

Returns
-------
fill_value : negative infinity value corresponding to this dtype.
"""
if np.issubdtype(dtype, np.floating):
return -np.inf

return NINF


def is_datetime_like(dtype):
"""Check if a dtype is a subclass of the numpy datetime types
"""
125 changes: 105 additions & 20 deletions xarray/core/duck_array_ops.py
@@ -171,6 +171,83 @@ def _ignore_warnings_if(condition):
yield


def _nansum_object(value, axis=None, **kwargs):
""" In house nansum for object array """
value = fillna(value, 0)
return _dask_or_eager_func('sum')(value, axis=axis, **kwargs)


def _nan_minmax_object(func, fill_value_typ, value, axis=None, **kwargs):
""" In house nanmin and nanmax for object array """
if fill_value_typ == '+inf':
Review comment (Member):

Nit: Instead of passing a separate string, we might just pass the function to make the fill value directly (dtypes.get_pos_infinity or dtypes.get_neg_infinity).

That would let us drop these conditionals and error-prone string matching.
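
A sketch of that refactor (hypothetical, not what this diff does; it assumes the surrounding ``duck_array_ops.py`` scope, where ``count``, ``fillna``, ``where_method``, ``_dask_or_eager_func`` and ``dtypes`` are available):

    def _nan_minmax_object(func, get_fill_value, value, axis=None, **kwargs):
        fill_value = get_fill_value(value.dtype)  # e.g. dtypes.get_pos_infinity
        valid_count = count(value, axis=axis)
        filled_value = fillna(value, fill_value)
        data = _dask_or_eager_func(func)(filled_value, axis=axis, **kwargs)
        # scalar-result handling omitted for brevity
        return where_method(data, valid_count != 0)

    # registration would then read, for example:
    # 'min': partial(_nan_minmax_object, 'min', dtypes.get_pos_infinity),
    # 'max': partial(_nan_minmax_object, 'max', dtypes.get_neg_infinity),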

fill_value = dtypes.get_pos_infinity(value.dtype)
else:
fill_value = dtypes.get_neg_infinity(value.dtype)
valid_count = count(value, axis=axis)
filled_value = fillna(value, fill_value)
data = _dask_or_eager_func(func)(filled_value, axis=axis, **kwargs)
if not hasattr(data, 'dtype'): # scalar case
data = dtypes.fill_value(value.dtype) if valid_count == 0 else data
return np.array(data, dtype=value.dtype)
return where_method(data, valid_count != 0)


def _nan_argminmax_object(func, fill_value_typ, value, axis=None, **kwargs):
""" In house nanargmin, nanargmax for object arrays. Always return integer
type """
if fill_value_typ == '+inf':
fill_value = dtypes.get_pos_infinity(value.dtype)
else:
fill_value = dtypes.get_neg_infinity(value.dtype)
valid_count = count(value, axis=axis)
value = fillna(value, fill_value)
data = _dask_or_eager_func(func)(value, axis=axis, **kwargs)
# dask seems to return a non-integer type
if isinstance(value, dask_array_type):
data = data.astype(int)

if (valid_count == 0).any():
raise ValueError('All-NaN slice encountered')

return np.array(data, dtype=int)


def _nanmean_ddof_object(ddof, value, axis=None, **kwargs):
""" In house nanmean. ddof argument will be used in _nanvar method """
valid_count = count(value, axis=axis)
value = fillna(value, 0)
# As dtype inference is impossible for object dtype, we assume float
# https://github.com/dask/dask/issues/3162
dtype = kwargs.pop('dtype', None)
if dtype is None and value.dtype.kind == 'O':
dtype = value.dtype if value.dtype.kind in ['cf'] else float
Review comment (Member, PR author):

Is there a good workaround to infer the output dtype of the object-typed array?
We need to pass this to dask for the next division but dtype=object is not allowed.

Reply (Member):

Is this fixed by your dask PR dask/dask#3137?

If so, can we maybe say this requires using the latest dask release?

Review comment on lines +242 to +243 (@DimitriPapadopoulos, Contributor, Jun 29, 2025):

Wasn't the intent value.dtype.kind in "cf" instead of value.dtype.kind in ["cf"]?

In any case, this is a no-op because value.dtype.kind == "O". So remove the test altogether?

Suggested change:
-    if dtype is None and value.dtype.kind == 'O':
-        dtype = value.dtype if value.dtype.kind in ['cf'] else float
+    if dtype is None and value.dtype.kind == 'O':
+        dtype = float

I am attempting to fix this #10465 (comment).


data = _dask_or_eager_func('sum')(value, axis=axis, dtype=dtype, **kwargs)
data = data / (valid_count - ddof)
return where_method(data, valid_count != 0)


def _nanvar_object(value, axis=None, **kwargs):
ddof = kwargs.pop('ddof', 0)
kwargs_mean = kwargs.copy()
kwargs_mean.pop('keepdims', None)
value_mean = _nanmean_ddof_object(ddof=0, value=value, axis=axis,
keepdims=True, **kwargs_mean)
squared = (value.astype(value_mean.dtype) - value_mean)**2
return _nanmean_ddof_object(ddof, squared, axis=axis, **kwargs)


_nan_object_funcs = {
'sum': _nansum_object,
'min': partial(_nan_minmax_object, 'min', '+inf'),
'max': partial(_nan_minmax_object, 'max', '-inf'),
'argmin': partial(_nan_argminmax_object, 'argmin', '+inf'),
'argmax': partial(_nan_argminmax_object, 'argmax', '-inf'),
'mean': partial(_nanmean_ddof_object, 0),
'var': _nanvar_object,
}


def _create_nan_agg_method(name, numeric_only=False, np_compat=False,
no_bottleneck=False, coerce_strings=False,
keep_dims=False):
@@ -185,27 +262,31 @@ def f(values, axis=None, skipna=None, **kwargs):
if coerce_strings and values.dtype.kind in 'SU':
values = values.astype(object)

if skipna or (skipna is None and values.dtype.kind in 'cf'):
if skipna or (skipna is None and values.dtype.kind in 'cfO'):
if values.dtype.kind not in ['u', 'i', 'f', 'c']:
raise NotImplementedError(
'skipna=True not yet implemented for %s with dtype %s'
% (name, values.dtype))
nanname = 'nan' + name
if (isinstance(axis, tuple) or not values.dtype.isnative or
no_bottleneck or
(dtype is not None and np.dtype(dtype) != values.dtype)):
# bottleneck can't handle multiple axis arguments or non-native
# endianness
if np_compat:
eager_module = npcompat
else:
eager_module = np
func = _nan_object_funcs.get(name, None)
using_numpy_nan_func = True
if func is None or values.dtype.kind not in 'Ob':
raise NotImplementedError(
'skipna=True not yet implemented for %s with dtype %s'
% (name, values.dtype))
else:
kwargs.pop('dtype', None)
eager_module = bn
func = _dask_or_eager_func(nanname, eager_module)
using_numpy_nan_func = (eager_module is np or
eager_module is npcompat)
nanname = 'nan' + name
if (isinstance(axis, tuple) or not values.dtype.isnative or
no_bottleneck or (dtype is not None and
np.dtype(dtype) != values.dtype)):
# bottleneck can't handle multiple axis arguments or
# non-native endianness
if np_compat:
eager_module = npcompat
else:
eager_module = np
else:
kwargs.pop('dtype', None)
eager_module = bn
func = _dask_or_eager_func(nanname, eager_module)
using_numpy_nan_func = (eager_module is np or
eager_module is npcompat)
else:
func = _dask_or_eager_func(name)
using_numpy_nan_func = False
@@ -214,7 +295,11 @@ def f(values, axis=None, skipna=None, **kwargs):
return func(values, axis=axis, **kwargs)
except AttributeError:
if isinstance(values, dask_array_type):
msg = '%s is not yet implemented on dask arrays' % name
try: # dask/dask#3133 dask sometimes needs dtype argument
return func(values, axis=axis, dtype=values.dtype,
**kwargs)
except AttributeError:
msg = '%s is not yet implemented on dask arrays' % name
else:
assert using_numpy_nan_func
msg = ('%s is not available with skipna=False with the '