Commit d9f785a
Merge pull request #12 from mkraemerx/master
Fixing some of the compatibility issues from pandas 0.23
Mourad authored May 29, 2018
2 parents b35beaa + 0178ca8 commit d9f785a
Showing 2 changed files with 43 additions and 27 deletions.
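For context on the fix: pandas 0.23.0 added a `sort` keyword to `pd.concat` and began emitting a FutureWarning whenever the non-concatenation axis is not aligned, ahead of a planned change to the default. Passing `sort=True` opts in to the existing (sorted) behaviour explicitly and silences the warning, which is what this commit does at every `pd.concat` call site. A minimal sketch of the behaviour, illustrative only and not part of the diff:

    import pandas as pd

    a = pd.Series([1, 2], index=['x', 'y'])
    b = pd.Series([3, 4], index=['y', 'z'])

    # pandas 0.23 warns here: the non-concatenation axis is not aligned,
    # and the default sorting behaviour is scheduled to change.
    pd.concat([a, b], axis=1)

    # Passing sort=True keeps the pre-0.23 sorted result and silences the warning.
    pd.concat([a, b], axis=1, sort=True)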
pandas_summary/__init__.py: 39 changes (24 additions, 15 deletions)
@@ -42,7 +42,8 @@ def __getitem__(self, column):
             return self.df[list(column)].values
 
         if isinstance(column, pd.Index):
-            error_keys = [k for k in column.values if not self._clean_column(k)]
+            error_keys = [
+                k for k in column.values if not self._clean_column(k)]
             if len(error_keys) > 0:
                 raise KeyError(', '.join(error_keys))
             return self.df[column].values
@@ -60,7 +61,7 @@ def columns_types(self):
         return pd.value_counts(self.columns_stats.loc['types'])
 
     def summary(self):
-        return pd.concat([self.df.describe(), self.columns_stats])[self.df.columns]
+        return pd.concat([self.df.describe(), self.columns_stats], sort=True)[self.df.columns]
 
     @staticmethod
     def _number_format(x):
@@ -83,7 +84,7 @@ def _get_stats(self):
         counts.name = 'counts'
         uniques = self._get_uniques()
         missing = self._get_missing(counts)
-        stats = pd.concat([counts, uniques, missing], axis=1)
+        stats = pd.concat([counts, uniques, missing], axis=1, sort=True)
 
         # settings types
         stats['types'] = ''
@@ -100,7 +101,7 @@ def _get_missing(self, counts):
         count.name = 'missing'
         perc = (count / self.length).apply(self._percent)
         perc.name = 'missing_perc'
-        return pd.concat([count, perc], axis=1)
+        return pd.concat([count, perc], axis=1, sort=True)
 
     def _get_columns_info(self, stats):
         column_info = {}
@@ -111,24 +112,28 @@ def _get_columns_info(self, stats):
             column_info['constant'].union(column_info['bool']))
         column_info[self.TYPE_NUMERIC] = pd.Index([c for c in rest_columns
                                                    if types.is_numeric_dtype(self.df[c])])
-        rest_columns = self.get_columns(self.df[rest_columns], self.EXCLUDE, column_info['numeric'])
+        rest_columns = self.get_columns(
+            self.df[rest_columns], self.EXCLUDE, column_info['numeric'])
         column_info[self.TYPE_DATE] = pd.Index([c for c in rest_columns
                                                 if types.is_datetime64_dtype(self.df[c])])
-        rest_columns = self.get_columns(self.df[rest_columns], self.EXCLUDE, column_info['date'])
+        rest_columns = self.get_columns(
+            self.df[rest_columns], self.EXCLUDE, column_info['date'])
         unique_columns = stats['uniques'][rest_columns] == stats['counts'][rest_columns]
         column_info[self.TYPE_UNIQUE] = stats['uniques'][rest_columns][unique_columns].index
         column_info[self.TYPE_CATEGORICAL] = stats['uniques'][rest_columns][~unique_columns].index
         return column_info
 
     """ Column summaries """
 
     def _get_deviation_of_mean(self, series, multiplier=3):
         """
         Returns count of values deviating of the mean, i.e. larger than `multiplier` * `std`.
         :type series:
         :param multiplier:
         :return:
         """
-        capped_series = np.minimum(series, series.mean() + multiplier * series.std())
+        capped_series = np.minimum(
+            series, series.mean() + multiplier * series.std())
         count = pd.value_counts(series != capped_series)
         count = count[True] if True in count else 0
         perc = self._percent(count / self.length)
Expand All @@ -141,7 +146,8 @@ def _get_median_absolute_deviation(self, series, multiplier=3):
:param multiplier:
:return (array):
"""
capped_series = np.minimum(series, series.median() + multiplier * series.mad())
capped_series = np.minimum(
series, series.median() + multiplier * series.mad())
count = pd.value_counts(series != capped_series)
count = count[True] if True in count else 0
perc = self._percent(count / self.length)
@@ -181,20 +187,22 @@ def _get_numeric_summary(self, column, plot=True):
         stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] else np.nan
         stats['zeros_num'] = self.length - np.count_nonzero(series)
         stats['zeros_perc'] = self._percent(stats['zeros_num'] / self.length)
-        deviation_of_mean, deviation_of_mean_perc = self._get_deviation_of_mean(series)
+        deviation_of_mean, deviation_of_mean_perc = self._get_deviation_of_mean(
+            series)
         stats['deviating_of_mean'] = deviation_of_mean
         stats['deviating_of_mean_perc'] = deviation_of_mean_perc
-        deviating_of_median, deviating_of_median_perc = self._get_median_absolute_deviation(series)
+        deviating_of_median, deviating_of_median_perc = self._get_median_absolute_deviation(
+            series)
         stats['deviating_of_median'] = deviating_of_median
         stats['deviating_of_median_perc'] = deviating_of_median_perc
         stats['top_correlations'] = self._get_top_correlations(column)
-        return pd.concat([pd.Series(stats, name=column), self.columns_stats.ix[:, column]])
+        return pd.concat([pd.Series(stats, name=column), self.columns_stats.ix[:, column]], sort=True)
 
     def _get_date_summary(self, column):
         series = self.df[column]
         stats = {'min': series.min(), 'max': series.max()}
         stats['range'] = stats['max'] - stats['min']
-        return pd.concat([pd.Series(stats, name=column), self.columns_stats.ix[:, column]])
+        return pd.concat([pd.Series(stats, name=column), self.columns_stats.ix[:, column]], sort=True)
 
     def _get_categorical_summary(self, column):
         series = self.df[column]
@@ -203,7 +211,7 @@ def _get_categorical_summary(self, column):
         stats = {
             'top': '{}: {}'.format(value_counts.index[0], value_counts.iloc[0]),
         }
-        return pd.concat([pd.Series(stats, name=column), self.columns_stats.ix[:, column]])
+        return pd.concat([pd.Series(stats, name=column), self.columns_stats.ix[:, column]], sort=True)
 
     def _get_constant_summary(self, column):
         return 'This is a constant value: {}'.format(self.df[column][0])
@@ -217,7 +225,7 @@ def _get_bool_summary(self, column):
             stats['"{}" perc'.format(class_name)] = '{}'.format(
                 self._percent(class_value / self.length))
 
-        return pd.concat([pd.Series(stats, name=column), self.columns_stats.ix[:, column]])
+        return pd.concat([pd.Series(stats, name=column), self.columns_stats.ix[:, column]], sort=True)
 
     def _get_unique_summary(self, column):
         return self.columns_stats.ix[:, column]
@@ -256,7 +264,8 @@ def get_columns(self, df, usage, columns=None):
 
         if usage == self.INCLUDE:
             try:
-                columns_included = columns_included.intersection(pd.Index(columns))
+                columns_included = columns_included.intersection(
+                    pd.Index(columns))
             except TypeError:
                 pass
         elif usage == self.EXCLUDE:
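Since `summary()` concatenates `self.df.describe()` (which covers only numeric columns) with `self.columns_stats` (which covers all columns), the two frames share an identical column set only when every column is numeric, so on pandas 0.23 any mixed-type frame hit the warning before this change. A quick reproduction against the pre-fix code, a sketch using the class as exercised by the tests below, not part of the diff:

    import numpy as np
    import pandas as pd
    from pandas_summary import DataFrameSummary

    df = pd.DataFrame({'num': np.arange(10), 'cat': list('ab') * 5})
    dfs = DataFrameSummary(df)
    dfs.summary()  # warned on pandas 0.23 until sort=True was added above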
tests/test_dataframesummary.py: 31 changes (19 additions, 12 deletions)
@@ -14,7 +14,8 @@ class DataFrameSummaryTest(unittest.TestCase):
 
     def setUp(self):
         self.size = 1000
-        missing = [np.nan] * (self.size // 10) + list(range(10)) * ((self.size - self.size // 10) // 10)
+        missing = [np.nan] * (self.size // 10) + list(range(10)) * \
+            ((self.size - self.size // 10) // 10)
         shuffle(missing)
 
         self.types = [DataFrameSummary.TYPE_NUMERIC, DataFrameSummary.TYPE_BOOL,
@@ -33,7 +34,8 @@ def setUp(self):
                       'c'.format(i) for i in range(self.size)],
             dnumerics1=range(self.size),
             dnumerics2=range(self.size, 2 * self.size),
-            dnumerics3=list(range(self.size - self.size // 10)) + list(range(-self.size // 10, 0)),
+            dnumerics3=list(range(self.size - self.size // 10)
+                            ) + list(range(-self.size // 10, 0)),
             dmissing=missing,
             dconstant=['a'] * self.size,
             ddates=pd.date_range('2010-01-01', periods=self.size, freq='1M')))
@@ -52,8 +54,10 @@ def test_get_columns_works_as_expected(self):
                                        ['dnumerics1', 'dnumerics2', 'dnumerics3'])) == 7
 
     def test_column_types_works_as_expected(self):
-        expected = pd.Series(index=self.types, data=[4, 2, 1, 1, 1, 1], name='types')
-        assert_series_equal(self.dfs.columns_types[self.types], expected[self.types])
+        expected = pd.Series(index=self.types, data=[
+                             4, 2, 1, 1, 1, 1], name='types')
+        assert_series_equal(
+            self.dfs.columns_types[self.types], expected[self.types])
 
     def test_column_stats_works_as_expected(self):
         column_stats = self.dfs.columns_stats
@@ -77,8 +81,8 @@ def test_column_stats_works_as_expected(self):
         expected[['dcategoricals']] = 3
         expected[['dconstant']] = 1
         expected[['dmissing']] = 10
-        assert_series_equal(column_stats[self.columns].loc['uniques'],
-                            expected[self.columns])
+        assert_series_equal(column_stats[self.columns].loc['uniques'].sort_index(),
+                            expected[self.columns].sort_index(), check_dtype=False)
 
         # missing
         expected = pd.Series(index=self.columns,
@@ -144,7 +148,8 @@ def test_get_perc_works_as_expected(self):
 
     def test_uniques_summary(self):
        expected = pd.Series(index=['counts', 'uniques', 'missing', 'missing_perc', 'types'],
-                            data=[self.size, self.size, 0, '0%', DataFrameSummary.TYPE_UNIQUE],
+                            data=[self.size, self.size, 0, '0%',
+                                  DataFrameSummary.TYPE_UNIQUE],
                             name='duniques',
                             dtype=object)
        assert_series_equal(self.dfs['duniques'],
@@ -165,9 +170,9 @@ def test_bool1_summary(self):
                             data=[str(count0), perc0, str(count1), perc1,
                                   self.size, 2, 0, '0%', DataFrameSummary.TYPE_BOOL],
                             name='dbool1',
-                            dtype=object)
+                            dtype=object).sort_index()
 
-        assert_series_equal(self.dfs['dbool1'],
+        assert_series_equal(self.dfs['dbool1'].sort_index(),
                             expected)
 
     def test_bool2_summary(self):
@@ -206,9 +211,10 @@ def test_dates_summary(self):
                             data=[dmax, dmin, dmax - dmin,
                                   self.size, self.size, 0, '0%', DataFrameSummary.TYPE_DATE],
                             name='ddates',
-                            dtype=object)
+                            dtype=object).sort_index()
 
-        assert_series_equal(self.dfs['ddates'],
+        tmp = self.dfs['ddates'].sort_index()
+        assert_series_equal(tmp,
                             expected)
 
     def test_numerics_summary(self):
@@ -222,7 +228,8 @@ def test_numerics_summary(self):
                                    'deviating_of_median_perc', 'top_correlations', 'counts',
                                    'uniques', 'missing', 'missing_perc', 'types'],
                             data=[num1.mean(), num1.std(), num1.var(), num1.min(), num1.max(),
-                                  num1.quantile(0.05), num1.quantile(0.25), num1.quantile(0.5),
+                                  num1.quantile(0.05), num1.quantile(
+                                      0.25), num1.quantile(0.5),
                                   num1.quantile(0.75), num1.quantile(0.95),
                                   num1.quantile(0.75) - num1.quantile(0.25),
                                   num1.kurt(), num1.skew(), num1.sum(), num1.mad(),
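The test-side edits follow from the same behaviour change: with `sort=True`, `pd.concat` returns the per-column summary rows in lexicographic index order rather than construction order, so both sides of each comparison are normalised with `.sort_index()` before `assert_series_equal`, and `check_dtype=False` apparently tolerates the dtype drift a sorted concat can introduce. A minimal sketch of the pattern, with illustrative values:

    import pandas as pd
    from pandas.testing import assert_series_equal

    actual = pd.Series({'uniques': 3, 'counts': 10})
    expected = pd.Series({'counts': 10, 'uniques': 3})

    # assert_series_equal is index-order-sensitive, so sort both sides first.
    assert_series_equal(actual.sort_index(), expected.sort_index())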

0 comments on commit d9f785a
