1- from .pandas_vb_common import *
2- from random import shuffle
1+ import numpy as np
2+ import pandas .util .testing as tm
3+ from pandas import (DataFrame , Series , DatetimeIndex , MultiIndex , Index ,
4+ date_range )
5+ from .pandas_vb_common import setup , lib # noqa
36
47
class Reindex(object):
    """Benchmarks for ``reindex`` on a date-indexed frame, on columns, and
    on a Series carrying a two-level MultiIndex."""

    goal_time = 0.2

    def setup(self):
        """Create the frames, the series and the target indexes that the
        ``time_*`` methods reindex against."""
        # Built with date_range: the DatetimeIndex(start=..., periods=...,
        # freq=...) constructor form is deprecated in pandas.
        rng = date_range(start='1/1/1970', periods=10000, freq='1min')
        self.df = DataFrame(np.random.rand(10000, 10), index=rng,
                            columns=range(10))
        # Adds a string column next to the ten float columns.
        self.df['foo'] = 'bar'
        # Every second timestamp of ``rng``.
        self.rng_subset = Index(rng[::2])
        self.df2 = DataFrame(index=range(10000),
                             data=np.random.rand(10000, 30),
                             columns=range(30))

        # Two-level index with N * K entries: each of the N outer labels is
        # repeated K times, and the K inner labels are tiled N times.
        N = 5000
        K = 200
        level1 = tm.makeStringIndex(N).values.repeat(K)
        level2 = np.tile(tm.makeStringIndex(K).values, N)
        index = MultiIndex.from_arrays([level1, level2])
        self.s = Series(np.random.randn(N * K), index=index)
        # Every second element of ``self.s``.
        self.s_subset = self.s[::2]

    def time_reindex_dates(self):
        """Reindex the date-indexed frame onto ``self.rng_subset``."""
        self.df.reindex(self.rng_subset)

    def time_reindex_columns(self):
        """Reindex ``self.df2`` onto the column labels ``self.df.columns[1:5]``."""
        self.df2.reindex(columns=self.df.columns[1:5])

    def time_reindex_multiindex(self):
        """Reindex the MultiIndex series onto the index of its own subset."""
        self.s.reindex(self.s_subset.index)
3536
3637
class ReindexMethod(object):
    """Benchmark ``Series.reindex`` with a fill ``method``.

    Parametrized over the fill method name; each timed call reindexes a
    half-density time series back onto the full index.
    """

    goal_time = 0.2
    params = ['pad', 'backfill']
    param_names = ['method']

    def setup(self, method):
        """Create a 1-minute index of N points and a series holding every
        second point of it.

        ``method`` is accepted because the class is parametrized; it is not
        used while building the data.
        """
        N = 100000
        self.idx = date_range('1/1/2000', periods=N, freq='1min')
        # Keep every second observation so reindexing to ``self.idx`` has
        # gaps for the fill method to fill.
        self.ts = Series(np.random.randn(N), index=self.idx)[::2]

    def time_reindex_method(self, method):
        """Reindex the sparse series onto the full index using ``method``."""
        self.ts.reindex(self.idx, method=method)
6851
class Fillna(object):
    """Benchmark ``Series.fillna(method=...)`` on float64 and float32 data.

    Parametrized over the fill method name.
    """

    goal_time = 0.2
    params = ['pad', 'backfill']
    param_names = ['method']

    def setup(self, method):
        """Create a series with a missing value at every second position,
        plus a float32 copy of it.

        ``method`` is accepted because the class is parametrized; it is not
        used while building the data.
        """
        N = 100000
        self.idx = date_range('1/1/2000', periods=N, freq='1min')
        # Take every second point, then reindex back to the full index so
        # the dropped positions come back as missing values.
        ts = Series(np.random.randn(N), index=self.idx)[::2]
        self.ts_reindexed = ts.reindex(self.idx)
        self.ts_float32 = self.ts_reindexed.astype('float32')

    # NOTE(review): the ``method`` keyword of ``fillna`` is deprecated in
    # recent pandas releases - confirm against the targeted pandas version.
    def time_reindexed(self, method):
        """Fill the missing values of the float64 series using ``method``."""
        self.ts_reindexed.fillna(method=method)

    def time_float_32(self, method):
        """Fill the missing values of the float32 series using ``method``."""
        self.ts_float32.fillna(method=method)
8471
8572
8673class LevelAlign (object ):
74+
8775 goal_time = 0.2
8876
8977 def setup (self ):
@@ -92,7 +80,6 @@ def setup(self):
9280 labels = [np .arange (10 ).repeat (10000 ),
9381 np .tile (np .arange (100 ).repeat (100 ), 10 ),
9482 np .tile (np .tile (np .arange (100 ), 100 ), 10 )])
95- random .shuffle (self .index .values )
9683 self .df = DataFrame (np .random .randn (len (self .index ), 4 ),
9784 index = self .index )
9885 self .df_level = DataFrame (np .random .randn (100 , 4 ),
@@ -102,103 +89,85 @@ def time_align_level(self):
10289 self .df .align (self .df_level , level = 1 , copy = False )
10390
10491 def time_reindex_level (self ):
105- self .df_level .reindex (self .df . index , level = 1 )
92+ self .df_level .reindex (self .index , level = 1 )
10693
10794
class DropDuplicates(object):
    """Benchmark ``drop_duplicates`` on frames and series of several dtypes.

    Parametrized over the ``inplace`` flag passed to ``drop_duplicates``.
    """

    # NOTE(review): with inplace=True a timed call mutates the fixture it
    # runs on, so any further call made before the next setup() sees data
    # that is already de-duplicated - confirm this suits the runner's
    # repeat settings.

    goal_time = 0.2
    params = [True, False]
    param_names = ['inplace']

    def setup(self, inplace):
        """Create the frames and series that the ``time_*`` methods
        de-duplicate.

        ``inplace`` is accepted because the class is parametrized; it is not
        used while building the data.
        """
        # Frame with two string key columns (each label repeated K times)
        # and one float value column.
        N = 10000
        K = 10
        key1 = tm.makeStringIndex(N).values.repeat(K)
        key2 = tm.makeStringIndex(N).values.repeat(K)
        self.df = DataFrame({'key1': key1, 'key2': key2,
                             'value': np.random.randn(N * K)})
        # Copy of the frame whose first 10000 rows are set to NaN.
        self.df_nan = self.df.copy()
        self.df_nan.iloc[:10000, :] = np.nan

        self.s = Series(np.random.randint(0, 1000, size=10000))
        self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))

        # Single integer key column: n_rows draws from n_keys possible values.
        n_rows = 1000000
        n_keys = 10000
        int_keys = np.random.randint(0, n_keys, size=n_rows)
        self.df_int = DataFrame({'key1': int_keys})
        # Boolean frame with n_keys rows and 10 columns.
        self.df_bool = DataFrame(np.random.randint(0, 2, size=(n_keys, 10),
                                                   dtype=bool))

    def time_frame_drop_dups(self, inplace):
        """Drop duplicate rows of the string-keyed frame by both key columns."""
        self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_frame_drop_dups_na(self, inplace):
        """Same as ``time_frame_drop_dups`` on the frame containing NaN rows."""
        self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)

    def time_series_drop_dups_int(self, inplace):
        """Drop duplicate values of the integer series."""
        self.s.drop_duplicates(inplace=inplace)

    def time_series_drop_dups_string(self, inplace):
        """Drop duplicate values of the string series."""
        self.s_str.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_int(self, inplace):
        """Drop duplicate rows of the single-integer-column frame."""
        self.df_int.drop_duplicates(inplace=inplace)

    def time_frame_drop_dups_bool(self, inplace):
        """Drop duplicate rows of the boolean frame."""
        self.df_bool.drop_duplicates(inplace=inplace)
165138
166139
class Align(object):
    """Benchmark adding two string-indexed Series whose indexes differ."""

    # blog "pandas escaped the zoo"
    goal_time = 0.2

    def setup(self):
        """Create a series over ``n`` string labels and a second series over
        a random sample of those labels drawn without replacement."""
        n = 50000
        indices = tm.makeStringIndex(n)
        subsample_size = 40000
        self.x = Series(np.random.randn(n), indices)
        # np.random.choice with replace=False picks distinct labels in
        # random order, so ``y`` is indexed by a shuffled subset of the
        # labels of ``x``.
        self.y = Series(np.random.randn(subsample_size),
                        index=np.random.choice(indices, subsample_size,
                                               replace=False))

    def time_align_series_irregular_string(self):
        """Add the two series; the result is discarded."""
        self.x + self.y
186155
187156
188157class LibFastZip (object ):
158+
189159 goal_time = 0.2
190160
191161 def setup (self ):
192- self .N = 10000
193- self .K = 10
194- self .key1 = tm .makeStringIndex (self .N ).values .repeat (self .K )
195- self .key2 = tm .makeStringIndex (self .N ).values .repeat (self .K )
196- self .df = DataFrame ({'key1' : self .key1 , 'key2' : self .key2 , 'value' : np .random .randn ((self .N * self .K )), })
197- self .col_array_list = list (self .df .values .T )
198-
199- self .df2 = self .df .copy ()
200- self .df2 .ix [:10000 , :] = np .nan
201- self .col_array_list2 = list (self .df2 .values .T )
162+ N = 10000
163+ K = 10
164+ key1 = tm .makeStringIndex (N ).values .repeat (K )
165+ key2 = tm .makeStringIndex (N ).values .repeat (K )
166+ col_array = np .vstack ([key1 , key2 , np .random .randn (N * K )])
167+ col_array2 = col_array .copy ()
168+ col_array2 [:, :10000 ] = np .nan
169+ self .col_array_list = list (col_array )
170+ self .col_array_list2 = list (col_array2 )
202171
203172 def time_lib_fast_zip (self ):
204173 lib .fast_zip (self .col_array_list )
0 commit comments