Merge pull request #3736 from jreback/mask_slow

jreback · jreback · commit bcec983ca2d9 · 2013-06-02T04:58:39.000-07:00
PERF: speed up where operations when splitting blocks (GH3733)
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -76,6 +76,7 @@ pandas 0.11.1
     GH3572_). This happens before any drawing takes place which elimnates any
     spurious plots from showing up.
   - Added Faq section on repr display options, to help users customize their setup.
+  - ``where`` operations that result in block splitting are much faster (GH3733_)
 
 **API Changes**
 
@@ -116,6 +117,8 @@ pandas 0.11.1
   - the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are
     deprecated
   - Implement ``__nonzero__`` for ``NDFrame`` objects (GH3691_, GH3696_)
+  - ``as_matrix`` with mixed signed and unsigned integer dtypes will result in a signed integer with
+    twice the itemsize of the unsigned lcd, capped at ``int64``, to avoid precision issues (GH3733_)
 
 **Bug Fixes**
 
@@ -273,6 +276,7 @@ pandas 0.11.1
 .. _GH3691: https://github.com/pydata/pandas/issues/3691
 .. _GH3696: https://github.com/pydata/pandas/issues/3696
 .. _GH3667: https://github.com/pydata/pandas/issues/3667
+.. _GH3733: https://github.com/pydata/pandas/issues/3733
 
 pandas 0.11.0
 =============
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -558,42 +558,38 @@ def func(c,v,o):
                     result.fill(np.nan)
                     return result
 
-        def create_block(result, items, transpose=True):
+        # see if we can operate on the entire block, or need item-by-item
+        result = func(cond,values,other)
+        if self._can_hold_na:
+
             if not isinstance(result, np.ndarray):
                 raise TypeError('Could not compare [%s] with block values'
                                 % repr(other))
 
-            if transpose and is_transposed:
+            if is_transposed:
                 result = result.T
 
             # try to cast if requested
             if try_cast:
                 result = self._try_cast_result(result)
 
-            return make_block(result, items, self.ref_items)
+            return make_block(result, self.items, self.ref_items)
 
-        # see if we can operate on the entire block, or need item-by-item
-        if not self._can_hold_na:
-            axis = cond.ndim-1
-            result_blocks = []
-            for item in self.items:
-                loc  = self.items.get_loc(item)
-                item = self.items.take([loc])
-                v    = values.take([loc],axis=axis)
-                c    = cond.take([loc],axis=axis)
-                o    = other.take([loc],axis=axis) if hasattr(other,'shape') else other
-
-                result = func(c,v,o)
-                if len(result) == 1:
-                    result = np.repeat(result,self.shape[1:])
-
-                result = _block_shape(result,ndim=self.ndim,shape=self.shape[1:])
-                result_blocks.append(create_block(result, item, transpose=False))
-
-            return result_blocks
-        else:
-            result = func(cond,values,other)
-            return create_block(result, self.items)
+        # might need to separate out blocks: items whose cond is all True vs. the rest
+        axis = cond.ndim-1
+        cond = cond.swapaxes(axis,0)
+        mask = np.array([ cond[i].all() for i in range(cond.shape[0])],dtype=bool)
+
+        result_blocks = []
+        for m in [mask, ~mask]:
+            if m.any():
+                items = self.items[m]
+                slices = [slice(None)] * cond.ndim
+                slices[axis] = self.items.get_indexer(items)
+                r = self._try_cast_result(result[slices])
+                result_blocks.append(make_block(r.T, items, self.ref_items))
+
+        return result_blocks
 
 class NumericBlock(Block):
     is_numeric = True
@@ -2429,7 +2425,22 @@ def _lcd_dtype(l):
     elif have_bool:
         return np.dtype(bool)
     elif have_int and not have_float and not have_complex:
-        return _lcd_dtype(counts[IntBlock])
+
+        # if we are mixing unsigned and signed, then return
+        # the next biggest int type (if we can)
+        lcd = _lcd_dtype(counts[IntBlock])
+        kinds = set([ i.dtype.kind for i in counts[IntBlock] ])
+        if len(kinds) == 1:
+            return lcd
+
+        if lcd == 'uint64' or lcd == 'int64':
+            return np.dtype('int64')
+
+        # return the signed int with double the itemsize if the lcd is unsigned
+        if lcd.kind == 'u':
+            return np.dtype('int%s' % (lcd.itemsize*8*2))
+        return lcd
+    
     elif have_dt64 and not have_float and not have_complex:
         return np.dtype('M8[ns]')
     elif have_complex:
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -271,6 +271,16 @@ def test_getitem_boolean_casting(self):
         expected = Series({'float64': 6, 'int32' : 1, 'int64' : 1})
         assert_series_equal(result, expected)
 
+        # where dtype conversions
+        # GH 3733
+        df = DataFrame(data = np.random.randn(100, 50))
+        df = df.where(df > 0) # create nans
+        bools = df > 0
+        mask = isnull(df)
+        expected = bools.astype(float).mask(mask)
+        result = bools.mask(mask)
+        assert_frame_equal(result,expected)
+
     def test_getitem_boolean_list(self):
         df = DataFrame(np.arange(12).reshape(3, 4))
 
@@ -7568,8 +7578,10 @@ def test_where(self):
 
         def _safe_add(df):
             # only add to the numeric items
-            return DataFrame(dict([ (c,s+1) if issubclass(s.dtype.type, (np.integer,np.floating)) else (c,s) for c, s in df.iteritems() ]))
-
+            def is_ok(s):
+                return issubclass(s.dtype.type, (np.integer,np.floating)) and s.dtype != 'uint8'
+            return DataFrame(dict([ (c,s+1) if is_ok(s) else (c,s) for c, s in df.iteritems() ]))
+        
         def _check_get(df, cond, check_dtypes = True):
             other1 = _safe_add(df)
             rs = df.where(cond, other1)
@@ -7605,20 +7617,24 @@ def _check_get(df, cond, check_dtypes = True):
         def _check_align(df, cond, other, check_dtypes = True):
             rs = df.where(cond, other)
             for i, k in enumerate(rs.columns):
-                v = rs[k]
+                result = rs[k]
                 d = df[k].values
                 c = cond[k].reindex(df[k].index).fillna(False).values
 
                 if np.isscalar(other):
                     o = other
                 else:
                     if isinstance(other,np.ndarray):
-                        o = Series(other[:,i],index=v.index).values
+                        o = Series(other[:,i],index=result.index).values
                     else:
                         o = other[k].values
 
                 new_values = d if c.all() else np.where(c, d, o)
-                assert_series_equal(v, Series(new_values,index=v.index))
+                expected = Series(new_values,index=result.index)
+
+                # since we can't always have the correct numpy dtype
+                # as numpy doesn't know how to downcast, don't check
+                assert_series_equal(result, expected, check_dtype=False)
 
             # dtypes
             # can't check dtype when other is an ndarray
@@ -9894,14 +9910,14 @@ def test_as_matrix_lcd(self):
         self.assert_(values.dtype == np.float16)
 
         values = self.mixed_int.as_matrix(['A','B','C','D'])
-        self.assert_(values.dtype == np.uint64)
+        self.assert_(values.dtype == np.int64)
 
         values = self.mixed_int.as_matrix(['A','D'])
         self.assert_(values.dtype == np.int64)
 
         # guess all ints are cast to uints....
         values = self.mixed_int.as_matrix(['A','B','C'])
-        self.assert_(values.dtype == np.uint64)
+        self.assert_(values.dtype == np.int64)
 
         values = self.mixed_int.as_matrix(['A','C'])
         self.assert_(values.dtype == np.int32)
diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py
@@ -177,3 +177,18 @@ def f(K=500):
 """
 
 frame_xs_col = Benchmark('df.xs(50000,axis = 1)', setup)
+
+## masking
+setup = common_setup + """
+data = np.random.randn(1000, 500)
+df = DataFrame(data)
+df = df.where(df > 0) # create nans
+bools = df > 0
+mask = isnull(df)
+"""
+
+mask_bools = Benchmark('bools.mask(mask)', setup,
+                         start_date=datetime(2013,1,1))
+
+mask_floats  = Benchmark('bools.astype(float).mask(mask)', setup,
+                         start_date=datetime(2013,1,1))