Commit 746646c
1 parent: 6d09ad5
Showing 31 changed files with 18,262 additions and 0 deletions.
@@ -0,0 +1 @@
.DS_Store
@@ -0,0 +1,129 @@
import pandas as pd
import numpy as np
import argparse


def get_weekly_epicurve(filename, d):
    """Read one simulated epicurve (patch x day) and aggregate days into bins of width d."""
    sim_epicurve = pd.read_csv(filename, index_col=0, header=None, delimiter=' ')
    return sim_epicurve.groupby(sim_epicurve.columns // d, axis=1).sum()

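# Illustrative example (values hypothetical): with d=7, daily columns 0..13
# map to 0..13 // 7 = [0]*7 + [1]*7, so a row of fourteen 1s aggregates to
# the two weekly totals [7, 7].
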
def get_ids(work_dir, data_horizon, d, nsamp, log, th=None):
    """Resample nsamp particle ids in proportion to their fitted weights."""
    if log:
        if th is None:
            cells = pd.read_csv('{}cells_{}_{}_log.csv'.format(work_dir, data_horizon, d))
        else:
            cells = pd.read_csv('{}cells_{}_{}_{}_log.csv'.format(work_dir, data_horizon, d, th))
    else:
        cells = pd.read_csv('{}cells_{}_{}.csv'.format(work_dir, data_horizon, d))
    # Weighted bootstrap: particles are drawn with replacement, so well-fitting ones recur.
    ids = np.random.choice(cells.id, nsamp, True, cells.weight)
    if log:
        if th is None:
            np.savetxt('{}id_{}_{}_log.txt'.format(work_dir, data_horizon, d), ids)
        else:
            np.savetxt('{}id_{}_{}_{}_log.txt'.format(work_dir, data_horizon, d, th), ids)
    else:
        np.savetxt('{}id_{}_{}.txt'.format(work_dir, data_horizon, d), ids)
    return ids

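# Illustrative example (values hypothetical): with cells.id = [0, 1, 2] and
# cells.weight = [0.1, 0.2, 0.7], a draw of nsamp=5 might return
# [2, 2, 1, 2, 0] -- high-weight particles dominate the resample.
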
def get_mape(work_dir, data_horizon, d, ids, ground_truth, ED_frac, log, th=None, look_ahead=4):

    FIPS = [36005, 36047, 36061, 36081, 36085]

    forecast_horizon = data_horizon + look_ahead
    forecast_horizon = 30  # override: score the full 30-week season (matches the *_30 output names)
    best_ids = np.unique(ids)
    dict_d = {}
    for b_id in best_ids:
        best_cell = work_dir + 'outs/cell_{}.out'.format(b_id)
        sim_epicurve = get_weekly_epicurve(best_cell, 7)
        # Scale simulated infections down to expected ED visits per borough.
        sim_epicurve = sim_epicurve.multiply(ED_frac.ED_frac.values, axis=0)

        dict_d[b_id] = sim_epicurve.iloc[:, range(forecast_horizon)]

    ## compute summaries
    dff = {}
    dfp = pd.DataFrame(columns=[0.05, 0.275, 0.5, 0.725, 0.95], index=FIPS)

    for fips in FIPS:
        dd = []
        # b_id here is the last particle from the loop above; all particles share the same week count.
        dfc = pd.DataFrame(columns=[0.05, 0.275, 0.5, 0.725, 0.95], index=range(dict_d[b_id].shape[1]))
        for k in dict_d.keys():
            dd.append(dict_d[k].loc[fips])
        df = pd.DataFrame(dd, index=dict_d.keys())
        # Quantiles are taken over the resampled ids (with multiplicity), not the unique ones.
        for c in dfc.columns:
            dfc[c] = df.loc[ids, :].quantile(q=c, axis=0)

        dff[fips] = dfc
        # Peak-week quantiles across the resampled trajectories.
        dfp.loc[fips, :] = df.loc[ids, :].idxmax(axis=1).quantile([0.05, 0.275, 0.5, 0.725, 0.95])

    # Point forecast = weekly medians beyond the training horizon; error against observed ED ILI+.
    pred_df = pd.DataFrame(columns=range(data_horizon, forecast_horizon), index=FIPS)
    for f in FIPS:
        pred_df.loc[f, :] = dff[f].loc[range(data_horizon, forecast_horizon), 0.5]
    diff_df = abs(pred_df - ground_truth.loc[:, range(data_horizon, forecast_horizon)])
    mape_df = diff_df.divide(ground_truth.iloc[:, range(data_horizon, forecast_horizon)])

    if log:
        if th is None:
            np.save('{}forecast_{}_{}_log.npy'.format(work_dir, data_horizon, d), dff)
            np.save('{}peak_forecast_{}_{}_log.npy'.format(work_dir, data_horizon, d), dfp)
        else:
            np.save('{}forecast_{}_{}_{}_log_30.npy'.format(work_dir, data_horizon, d, th), dff)
            np.save('{}peak_forecast_{}_{}_{}_log_30.npy'.format(work_dir, data_horizon, d, th), dfp)
    else:
        np.save('{}forecast_{}_{}.npy'.format(work_dir, data_horizon, d), dff)
        np.save('{}peak_forecast_{}_{}.npy'.format(work_dir, data_horizon, d), dfp)

    return mape_df

def main(log):

    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--season')
    parser.add_argument('-ddir', '--work_dir')
    parser.add_argument('-cumd', '--cum_days')
    parser.add_argument('-t', '--th')

    args = parser.parse_args()
    th = args.th

    ## ED_ILIplus is weekly
    datadir = 'data/surveillance/'  # trailing slash needed: paths are built by concatenation
    county_fips = pd.DataFrame({'FIPS': [36005, 36047, 36061, 36081, 36085],
                                'Borough': ['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'],
                                'color': ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00']})
    ground_truth = pd.read_csv(datadir + 'nyc_ED_ILIplus_{}.csv'.format(args.season), index_col=0)
    ground_truth.columns = range(len(ground_truth.columns))

    ED_frac = pd.read_csv(datadir + 'nyc_ED_frac_{}.csv'.format(args.season), index_col=0)
    adj_ground_truth = ground_truth.divide(ED_frac.ED_frac.values, axis=0)

    # One column of borough-level MAPEs per training horizon (weeks 10, 12, ..., 26).
    mape_dff = pd.DataFrame(columns=range(len(np.arange(10, 27, 2))), index=county_fips.FIPS)
    k = 0
    for data_horizon in np.arange(10, 27, 2):
        ids = get_ids(args.work_dir, data_horizon, args.cum_days, 100000, log=True, th=th)
        mape_df = get_mape(args.work_dir, data_horizon, args.cum_days, ids, ground_truth, ED_frac, log=True, th=th)
        mape_dff.iloc[:, k] = mape_df.mean(1).tolist()
        k = k + 1
    write_mape = True
    if write_mape:
        if log:
            if th is None:
                mape_dff.to_csv(args.work_dir + 'mape_all_' + str(args.cum_days) + '_log.csv')
            else:
                mape_dff.to_csv(args.work_dir + 'mape_all_' + str(args.cum_days) + '_' + str(th) + '_log.csv')
        else:
            mape_dff.to_csv(args.work_dir + 'mape_all_' + str(args.cum_days) + '.csv')


if __name__ == '__main__':
    main(log=True)
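# Hypothetical invocation (this script's filename is not shown in the diff,
# and the argument values below are placeholders; work_dir must end in '/'
# because paths are built by string concatenation):
#   python forecast_mape.py -s 2016 -ddir runs_2016/ -cumd 7 -t 0.0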
@@ -0,0 +1,118 @@
import pandas as pd
import numpy as np
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-s', '--season')
parser.add_argument('-wd', '--work_dir')
parser.add_argument('-l', '--label')

args = parser.parse_args()

datadir = 'data/surveillance/'  # trailing slash needed: paths are built by concatenation
county_fips = pd.DataFrame({'FIPS': [36005, 36047, 36061, 36081, 36085],
                            'Borough': ['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'],
                            'color': ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00']})

def read_ground_truth(season):
    ground_truth = pd.read_csv(datadir + 'nyc_ED_ILIplus_{}.csv'.format(season), index_col=0)
    ground_truth.columns = range(len(ground_truth.columns))

    # Back out total incidence from ED visits using the borough ED-visit fractions.
    ED_frac = pd.read_csv(datadir + 'nyc_ED_frac_{}.csv'.format(season), index_col=0)
    adj_ground_truth = ground_truth.divide(ED_frac.ED_frac.values, axis=0)

    return ED_frac, adj_ground_truth

def get_epicurve(filename, end_week=30, d=7):
    """Read one simulated epicurve, keep the five boroughs, and aggregate days to weeks."""
    sim_epicurve = pd.read_csv(filename, index_col=0, header=None, delimiter=' ')
    sim_epicurve = sim_epicurve.loc[[36005, 36047, 36061, 36081, 36085], :]
    sim_epicurve = sim_epicurve.iloc[:, range((end_week - 1) * 7)]
    sim_epicurve = sim_epicurve.groupby(sim_epicurve.columns // d, axis=1).sum()

    return sim_epicurve

def get_ids(score_file):
    cells = pd.read_csv(score_file)
    # Weighted bootstrap of 1000 particle ids, with replacement.
    ids = np.random.choice(cells.id, 1000, True, cells.weight)

    return ids

def get_best_training(work_dir, b_ids, adj_ground_truth, data_horizon):
    s = {}
    for b_id in b_ids:
        sim_epicurve = get_epicurve('{}outs/cell_{}.out'.format(work_dir, b_id))
        # s[b_id] = abs(sim_epicurve.iloc[:,range(data_horizon)] - adj_ground_truth.iloc[:,range(data_horizon)]).sum().sum()
        # Training-period MAPE, averaged over boroughs and weeks.
        s[b_id] = np.mean(np.mean(abs(sim_epicurve.iloc[:, range(data_horizon)] - adj_ground_truth.iloc[:, range(data_horizon)]) / adj_ground_truth.iloc[:, range(data_horizon)]))

    return s

def get_pred_score(work_dir, b_id, adj_ground_truth, data_horizon, look_ahead=4):
    sim_epicurve = get_epicurve('{}outs/cell_{}.out'.format(work_dir, b_id))
    # Forecast-period MAPE over the look-ahead window past the training horizon.
    a = np.mean(np.mean(abs(sim_epicurve.iloc[:, range(data_horizon + 1, data_horizon + look_ahead)] - adj_ground_truth.iloc[:, range(data_horizon + 1, data_horizon + look_ahead)]) / adj_ground_truth.iloc[:, range(data_horizon + 1, data_horizon + look_ahead)]))

    return a

def get_peak(work_dir, b_ids):
    ptime = {}
    psize = {}
    k = 0
    for b_id in b_ids:
        sim_epicurve = get_epicurve('{}outs/cell_{}.out'.format(work_dir, b_id))
        ptime[k] = sim_epicurve.idxmax(1)  # week of the peak, per borough
        psize[k] = sim_epicurve.max(1)     # height of the peak, per borough
        k = k + 1

    return pd.DataFrame(ptime), pd.DataFrame(psize)

def get_onset(work_dir, b_ids, adj_ground_truth, onset_th=0.1):
    on = []
    # Onset = first week the simulated curve exceeds onset_th of the observed peak.
    s = adj_ground_truth.max(1) * onset_th
    for b_id in b_ids:
        on_c = []
        sim_epicurve = get_epicurve('{}outs/cell_{}.out'.format(work_dir, b_id))  # read once per id, not once per borough
        for f in [36005, 36047, 36061, 36081, 36085]:
            ss = sim_epicurve.loc[f, :] > s[f]
            on_c.append(ss.idxmax())
        on.append(on_c)

    on_df = pd.DataFrame(on)
    on_df.columns = [36005, 36047, 36061, 36081, 36085]

    return on_df

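# Illustrative example (numbers hypothetical): with onset_th=0.1 and an
# observed adjusted peak of 200 in a borough, that borough's onset is the
# first week its simulated curve exceeds 20.
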
season = int(args.season)
work_dir = args.work_dir
label = args.label

ED_frac, adj_ground_truth = read_ground_truth(season)
peak_time = {}
peak_size = {}
onset = {}
for data_horizon in [10, 12, 14, 16, 18, 20, 22, 24, 26]:
    ids = get_ids('{}cells_{}_7_0.0_log.csv'.format(work_dir, data_horizon))
    ptime_df, psize_df = get_peak(work_dir, ids)
    on_df = get_onset(work_dir, ids, adj_ground_truth, onset_th=0.1)

    peak_time[data_horizon] = ptime_df.transpose()
    peak_size[data_horizon] = psize_df.transpose()
    onset[data_horizon] = on_df

# Bundle the three seasonal targets into one pickled dict.
d = {}
d['peakT'] = peak_time
d['peakS'] = peak_size
d['onsetT'] = onset

np.save('{}_{}_seasonal_target.npy'.format(season, label), d)
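# Hypothetical invocation (script filename and argument values are
# placeholders; work_dir must end in '/'):
#   python seasonal_targets.py -s 2016 -wd runs_2016/ -l RAD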
@@ -0,0 +1,101 @@
import pandas as pd
import numpy as np
import multiprocessing
from joblib import Parallel, delayed
import subprocess
import os
import sys

from datetime import datetime

sys.path.append('/projects/PatchSim/')  ## path to clone of NSSAC/PatchSim
import patchsim as sim

datadir = 'data/surveillance/'  # trailing slash needed: paths are built by concatenation
state_id = pd.DataFrame({'ID': ['NSW', 'Qld', 'SA', 'Tas', 'Vic', 'WA', 'ACT', 'NT'],
                         'State': ['New South Wales', 'Queensland', 'South Australia', 'Tasmania', 'Victoria', 'Western Australia', 'Australian Capital Territory', 'Northern Territory'],
                         })
season = 2016
suffix = '_RAD'  ## mobility model
config_template = 'cfg_AUS' + suffix
# Working directory is stamped with the current month and day, e.g. 615_AUS_RAD_2016/.
work_dir = str(datetime.now().month) + str(datetime.now().day) + '_AUS' + suffix + '_{}/'.format(season)

# Start from a clean working directory.
if work_dir[:-1] in os.listdir('.'):
    command = "rm -R {}".format(work_dir)
    subprocess.call(command, shell=True)

command = "mkdir {}".format(work_dir)
subprocess.call(command, shell=True)

### uniform sampling of 100000 particles
N = 100000
# Rate parameters substituted into the PatchSim config template below.
Betas = np.random.uniform(0.3, 0.9, size=N)
Alphas = np.random.uniform(0.3, 0.9, size=N)
Gammas = np.random.uniform(0.3, 0.9, size=N)
SeedThreshold = np.random.choice(range(10, 100), size=N, replace=True)
SeedPatchCount = np.random.choice(range(6, 8), size=N, replace=True)

# Per-state case-reporting scaling factors (note: no separate ACT column).
scaling_df = pd.DataFrame(np.random.uniform(0.01, 0.02, size=(N, 7)))
scaling_df.columns = ['NSW', 'Qld', 'SA', 'Tas', 'Vic', 'WA', 'NT']

cells = pd.DataFrame(np.array([Betas, Alphas, Gammas, SeedThreshold, SeedPatchCount]))
cells = pd.concat([cells.transpose(), scaling_df], axis=1, ignore_index=True)
cells.columns = ['beta', 'alpha', 'gamma', 'seedT', 'seedN', 'scaling1', 'scaling2', 'scaling3', 'scaling4', 'scaling5', 'scaling6', 'scaling7']

cells.index.name = 'id'
cells.to_csv(work_dir + 'cells.csv')
scaling_df.index.name = 'id'
scaling_df.to_csv(work_dir + 'scaling.csv')

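# Resulting cells.csv layout (one row per particle; values are the random
# draws above):
#   id,beta,alpha,gamma,seedT,seedN,scaling1,...,scaling7
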
if 'cfgs' not in os.listdir(work_dir):
    os.mkdir(work_dir + 'cfgs')

if 'logs' not in os.listdir(work_dir):
    os.mkdir(work_dir + 'logs')

with open(config_template, 'r') as f:
    cfg_template = f.read()
cfg_workdir = cfg_template.replace('$work_dir', str(work_dir))

# Stamp one PatchSim config per particle by substituting the $-placeholders.
for index, row in cells.iterrows():
    cfg_copy = cfg_workdir
    cfg_copy = cfg_copy.replace('$id', str(index))
    for param in cells.columns:
        cfg_copy = cfg_copy.replace('${}'.format(param), str(row['{}'.format(param)]))
    with open(work_dir + 'cfgs/cfg_{}'.format(index), 'w') as f:
        f.write(cfg_copy)

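# Illustrative fragment of what the cfg_AUS_RAD template could contain; the
# key names here are an assumption (only the $-placeholders and their
# substitution are shown in this diff):
#   ExposureRate=$beta
#   InfectionRate=$alpha
#   RecoveryRate=$gamma
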
## ground truth
ground_truth = pd.read_csv(datadir + 'aus_flupositive_2016.csv')
ground_truth = ground_truth.pivot(index='State', columns='Date', values='Count')
ground_truth.columns = range(len(ground_truth.columns))

if 'seeds' not in os.listdir(work_dir):
    os.mkdir(work_dir + 'seeds')

def gen_seed(i, ground_truth, cells, scaling_df):
    """Build the seeding file for particle i from the surveillance counts."""
    seedT = int(cells['seedT'][i])
    seedN = int(cells['seedN'][i])
    scaling = scaling_df.loc[i, :]
    # First week each state's count exceeds seedT; keep the seedN earliest states.
    seed_times = ground_truth.applymap(lambda x: x > seedT).idxmax(axis=1).reset_index()
    seeds = seed_times.sort_values(0).reset_index(drop=True).loc[0:seedN - 1]
    seeds['Count'] = seeds.apply(lambda row: ground_truth.loc[row['State'], row[0]], axis=1)
    # Inflate reported counts to infections using the particle's scaling factors.
    seeds['Count'] = np.divide(np.array(seeds['Count']), np.array(scaling[seeds['State']]))

    seeds.columns = ['State', 'Day', 'Count']
    seeds['Day'] *= 7  # weekly index -> day index
    seeds[['Day', 'State', 'Count']].to_csv(work_dir + 'seeds/seed_{}.csv'.format(i),
                                            index=None, header=None, sep=' ')


num_cores = 40
results = Parallel(n_jobs=num_cores)(delayed(gen_seed)(i, ground_truth, cells, scaling_df) for i in range(N))

if 'outs' not in os.listdir(work_dir):
    os.mkdir(work_dir + 'outs')


def run_patch(cfg):
    sim.run_disease_simulation(sim.read_config(cfg), write_epi=True)


cfgs = [work_dir + 'cfgs/{}'.format(x) for x in os.listdir(work_dir + 'cfgs/') if 'cfg_' in x]
num_cores = 40
results = Parallel(n_jobs=num_cores)(delayed(run_patch)(cfg) for cfg in cfgs)
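# A more portable alternative to the hard-coded 40 workers (a sketch that
# applies to both Parallel calls above); joblib also accepts n_jobs=-1:
#   num_cores = multiprocessing.cpu_count()
#   results = Parallel(n_jobs=num_cores)(delayed(run_patch)(cfg) for cfg in cfgs)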