Skip to content

Commit bcec983

Browse files
committed
Merge pull request #3736 from jreback/mask_slow
PERF: speed up where operations when splitting blocks (GH3733)
2 parents dff5e28 + 8e2d490 commit bcec983

File tree

4 files changed

+79
-33
lines changed

4 files changed

+79
-33
lines changed

RELEASE.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ pandas 0.11.1
7676
GH3572_). This happens before any drawing takes place which eliminates any
7777
spurious plots from showing up.
7878
- Added Faq section on repr display options, to help users customize their setup.
79+
- ``where`` operations that result in block splitting are much faster (GH3733_)
7980

8081
**API Changes**
8182

@@ -116,6 +117,8 @@ pandas 0.11.1
116117
- the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are
117118
deprecated
118119
- Implement ``__nonzero__`` for ``NDFrame`` objects (GH3691_, GH3696_)
120+
- ``as_matrix`` with mixed signed and unsigned dtypes will result in a signed int of twice the
121+
itemsize of the unsigned lcd, capped at ``int64``, to avoid precision issues (GH3733_)
119122

120123
**Bug Fixes**
121124

@@ -273,6 +276,7 @@ pandas 0.11.1
273276
.. _GH3691: https://github.com/pydata/pandas/issues/3691
274277
.. _GH3696: https://github.com/pydata/pandas/issues/3696
275278
.. _GH3667: https://github.com/pydata/pandas/issues/3667
279+
.. _GH3733: https://github.com/pydata/pandas/issues/3733
276280

277281
pandas 0.11.0
278282
=============

pandas/core/internals.py

Lines changed: 37 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -558,42 +558,38 @@ def func(c,v,o):
558558
result.fill(np.nan)
559559
return result
560560

561-
def create_block(result, items, transpose=True):
561+
# see if we can operate on the entire block, or need item-by-item
562+
result = func(cond,values,other)
563+
if self._can_hold_na:
564+
562565
if not isinstance(result, np.ndarray):
563566
raise TypeError('Could not compare [%s] with block values'
564567
% repr(other))
565568

566-
if transpose and is_transposed:
569+
if is_transposed:
567570
result = result.T
568571

569572
# try to cast if requested
570573
if try_cast:
571574
result = self._try_cast_result(result)
572575

573-
return make_block(result, items, self.ref_items)
576+
return make_block(result, self.items, self.ref_items)
574577

575-
# see if we can operate on the entire block, or need item-by-item
576-
if not self._can_hold_na:
577-
axis = cond.ndim-1
578-
result_blocks = []
579-
for item in self.items:
580-
loc = self.items.get_loc(item)
581-
item = self.items.take([loc])
582-
v = values.take([loc],axis=axis)
583-
c = cond.take([loc],axis=axis)
584-
o = other.take([loc],axis=axis) if hasattr(other,'shape') else other
585-
586-
result = func(c,v,o)
587-
if len(result) == 1:
588-
result = np.repeat(result,self.shape[1:])
589-
590-
result = _block_shape(result,ndim=self.ndim,shape=self.shape[1:])
591-
result_blocks.append(create_block(result, item, transpose=False))
592-
593-
return result_blocks
594-
else:
595-
result = func(cond,values,other)
596-
return create_block(result, self.items)
578+
# might need to separate out blocks
579+
axis = cond.ndim-1
580+
cond = cond.swapaxes(axis,0)
581+
mask = np.array([ cond[i].all() for i in enumerate(range(cond.shape[0]))],dtype=bool)
582+
583+
result_blocks = []
584+
for m in [mask, ~mask]:
585+
if m.any():
586+
items = self.items[m]
587+
slices = [slice(None)] * cond.ndim
588+
slices[axis] = self.items.get_indexer(items)
589+
r = self._try_cast_result(result[slices])
590+
result_blocks.append(make_block(r.T, items, self.ref_items))
591+
592+
return result_blocks
597593

598594
class NumericBlock(Block):
599595
is_numeric = True
@@ -2429,7 +2425,22 @@ def _lcd_dtype(l):
24292425
elif have_bool:
24302426
return np.dtype(bool)
24312427
elif have_int and not have_float and not have_complex:
2432-
return _lcd_dtype(counts[IntBlock])
2428+
2429+
# if we are mixing unsigned and signed, then return
2430+
# the next biggest int type (if we can)
2431+
lcd = _lcd_dtype(counts[IntBlock])
2432+
kinds = set([ i.dtype.kind for i in counts[IntBlock] ])
2433+
if len(kinds) == 1:
2434+
return lcd
2435+
2436+
if lcd == 'uint64' or lcd == 'int64':
2437+
return np.dtype('int64')
2438+
2439+
# return 1 bigger on the itemsize if unsigned
2440+
if lcd.kind == 'u':
2441+
return np.dtype('int%s' % (lcd.itemsize*8*2))
2442+
return lcd
2443+
24332444
elif have_dt64 and not have_float and not have_complex:
24342445
return np.dtype('M8[ns]')
24352446
elif have_complex:

pandas/tests/test_frame.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,16 @@ def test_getitem_boolean_casting(self):
271271
expected = Series({'float64': 6, 'int32' : 1, 'int64' : 1})
272272
assert_series_equal(result, expected)
273273

274+
# where dtype conversions
275+
# GH 3733
276+
df = DataFrame(data = np.random.randn(100, 50))
277+
df = df.where(df > 0) # create nans
278+
bools = df > 0
279+
mask = isnull(df)
280+
expected = bools.astype(float).mask(mask)
281+
result = bools.mask(mask)
282+
assert_frame_equal(result,expected)
283+
274284
def test_getitem_boolean_list(self):
275285
df = DataFrame(np.arange(12).reshape(3, 4))
276286

@@ -7568,8 +7578,10 @@ def test_where(self):
75687578

75697579
def _safe_add(df):
75707580
# only add to the numeric items
7571-
return DataFrame(dict([ (c,s+1) if issubclass(s.dtype.type, (np.integer,np.floating)) else (c,s) for c, s in df.iteritems() ]))
7572-
7581+
def is_ok(s):
7582+
return issubclass(s.dtype.type, (np.integer,np.floating)) and s.dtype != 'uint8'
7583+
return DataFrame(dict([ (c,s+1) if is_ok(s) else (c,s) for c, s in df.iteritems() ]))
7584+
75737585
def _check_get(df, cond, check_dtypes = True):
75747586
other1 = _safe_add(df)
75757587
rs = df.where(cond, other1)
@@ -7605,20 +7617,24 @@ def _check_get(df, cond, check_dtypes = True):
76057617
def _check_align(df, cond, other, check_dtypes = True):
76067618
rs = df.where(cond, other)
76077619
for i, k in enumerate(rs.columns):
7608-
v = rs[k]
7620+
result = rs[k]
76097621
d = df[k].values
76107622
c = cond[k].reindex(df[k].index).fillna(False).values
76117623

76127624
if np.isscalar(other):
76137625
o = other
76147626
else:
76157627
if isinstance(other,np.ndarray):
7616-
o = Series(other[:,i],index=v.index).values
7628+
o = Series(other[:,i],index=result.index).values
76177629
else:
76187630
o = other[k].values
76197631

76207632
new_values = d if c.all() else np.where(c, d, o)
7621-
assert_series_equal(v, Series(new_values,index=v.index))
7633+
expected = Series(new_values,index=result.index)
7634+
7635+
# since we can't always have the correct numpy dtype
7636+
# as numpy doesn't know how to downcast, don't check
7637+
assert_series_equal(result, expected, check_dtype=False)
76227638

76237639
# dtypes
76247640
# can't check dtype when other is an ndarray
@@ -9894,14 +9910,14 @@ def test_as_matrix_lcd(self):
98949910
self.assert_(values.dtype == np.float16)
98959911

98969912
values = self.mixed_int.as_matrix(['A','B','C','D'])
9897-
self.assert_(values.dtype == np.uint64)
9913+
self.assert_(values.dtype == np.int64)
98989914

98999915
values = self.mixed_int.as_matrix(['A','D'])
99009916
self.assert_(values.dtype == np.int64)
99019917

99029918
# guess all ints are cast to uints....
99039919
values = self.mixed_int.as_matrix(['A','B','C'])
9904-
self.assert_(values.dtype == np.uint64)
9920+
self.assert_(values.dtype == np.int64)
99059921

99069922
values = self.mixed_int.as_matrix(['A','C'])
99079923
self.assert_(values.dtype == np.int32)

vb_suite/frame_methods.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,3 +177,18 @@ def f(K=500):
177177
"""
178178

179179
frame_xs_col = Benchmark('df.xs(50000,axis = 1)', setup)
180+
181+
## masking
182+
setup = common_setup + """
183+
data = np.random.randn(1000, 500)
184+
df = DataFrame(data)
185+
df = df.where(df > 0) # create nans
186+
bools = df > 0
187+
mask = isnull(df)
188+
"""
189+
190+
mask_bools = Benchmark('bools.mask(mask)', setup,
191+
start_date=datetime(2013,1,1))
192+
193+
mask_floats = Benchmark('bools.astype(float).mask(mask)', setup,
194+
start_date=datetime(2013,1,1))

0 commit comments

Comments
 (0)