This repository has been archived by the owner on Aug 23, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_repeat_donations.py
70 lines (49 loc) · 2.19 KB
/
find_repeat_donations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
# coding: utf-8
from datetime import timedelta
import pandas as pd
def median_day(s):
s = pd.to_datetime(sorted(s))
s = (s - s.min()).to_series().dt.days
return s.median()
def statistics(debug=False):
df = pd.read_csv('all.csv')
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
start_of_last_month = df.date.max().replace(day=1)
prev_30_days = (start_of_last_month - timedelta(days=30))
df_last_month = df[df['date'] >= start_of_last_month]
last_month = df_last_month.loc[:, 'date'].min().strftime('%B') # string of month
df_prev_30_days = df[(df['date'] < start_of_last_month) & (df['date'] >= prev_30_days)]
prev_month = df_prev_30_days.loc[:, 'date'].min().month
id_median = df.groupby('id').agg({'date': tuple}).date.apply(median_day)
maybe_candidates = id_median[(id_median > 10) & (id_median < 40)].index
not_month_periodic = df[~df.id.isin(maybe_candidates)]
part_a = df[df.id.isin(maybe_candidates)].dropna(subset=['income'])
income_vc = part_a.income.value_counts()
interesting_incomes = income_vc[income_vc > 1].index
part_b = part_a[part_a.income.isin(interesting_incomes)].sort_values('date')
c = part_b.groupby(['id', 'income']).date.agg([set, 'count', median_day, 'max'])
repeaters = c[(c['count'] > 1) & (c['max'].dt.month == 4)]['max'].sort_index()
def summary(sdf, prefix):
return {
f'{prefix} income': sdf.income.sum(),
f'{prefix} expense': sdf.expense.sum(),
f'{prefix} net': sdf.income.sum() - sdf.expense.sum(),
}
if repeaters.empty:
ret = {}
else:
ret = dict(**{
'Repeating payments (i.e. probably Members)': repeaters.reset_index()['income'].sum(),
'Repeating payments count (i.e. member number estimate)': repeaters.shape[0],
},
**summary(df_last_month, f'{last_month}'),
**summary(df_prev_30_days, f'[{prev_30_days.date()}..{start_of_last_month.date()})')
)
ret = pd.Series(ret)
if debug:
return ret, df, last_month, prev_30_days
return ret
if __name__ == '__main__':
print(statistics())