Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
srinivvenkat committed Dec 9, 2020
1 parent 6d09ad5 commit 746646c
Show file tree
Hide file tree
Showing 31 changed files with 18,262 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.DS_Store
129 changes: 129 additions & 0 deletions code/gen_MAPE.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import pandas as pd
import itertools
import numpy as np
from scipy.stats import norm
from joblib import Parallel, delayed
import multiprocessing
import subprocess
import random
import os
from sklearn.utils.extmath import cartesian
import sys
from datetime import datetime

import argparse

def get_weekly_epicurve(filename, d):
    """Read a simulated epicurve and aggregate daily columns into d-day bins.

    filename : space-delimited file, one row per patch (first column is the
               index, e.g. a FIPS code), remaining columns are daily counts.
    d        : bin width in days (7 => weekly).

    Returns a DataFrame with one column per d-day bin (summed counts).

    Note: implemented via transpose-then-groupby because
    DataFrame.groupby(axis=1) is deprecated since pandas 2.1 and removed
    in pandas 3.0.
    """
    sim_epicurve = pd.read_csv(filename, index_col=0, header=None, delimiter=' ')
    return sim_epicurve.T.groupby(sim_epicurve.columns // d).sum().T

def get_ids(work_dir, data_horizon, d, nsamp, log, th=None):
    """Sample nsamp particle ids (with replacement) weighted by cell weight.

    work_dir     : directory (with trailing separator) holding cells_*.csv.
    data_horizon : number of weeks of data used for fitting (filename tag).
    d            : aggregation window tag used in the filename.
    nsamp        : number of ids to draw.
    log, th      : select the scoring-variant filename: plain, '_log', or
                   '_<th>_log'.

    Side effect: saves the drawn ids to id_<...>.txt in work_dir.
    Returns the sampled ids (numpy array, may contain repeats).
    """
    # The three filename variants differ only by a suffix; build it once
    # instead of branching around every read/write (original duplicated the
    # if/else three times).
    if log:
        suffix = '_log' if th is None else '_{}_log'.format(th)
    else:
        suffix = ''

    cells = pd.read_csv('{}cells_{}_{}{}.csv'.format(work_dir, data_horizon, d, suffix))
    ids = np.random.choice(cells.id, nsamp, True, cells.weight)
    np.savetxt('{}id_{}_{}{}.txt'.format(work_dir, data_horizon, d, suffix), ids)
    return ids

def get_mape(work_dir, data_horizon, d, ids, ground_truth, ED_frac, log, th=None, look_ahead=4):
    """Summarize forecasts for the sampled particles and return the MAPE of
    the median forecast against ground truth for the unobserved weeks.

    work_dir     : directory holding 'outs/cell_<id>.out' simulation outputs.
    data_horizon : number of observed weeks used for fitting.
    d            : aggregation-window tag used in output filenames.
    ids          : particle ids sampled by get_ids (repeats act as weights).
    ground_truth : observed weekly curve, indexed by FIPS code.
    ED_frac      : per-county ED reporting fraction used to scale simulations.
    log, th      : select which .npy filename variant is written.
    look_ahead   : nominal forecast length (currently dead — see below).

    Side effects: saves forecast quantiles (dff) and peak-week quantiles
    (dfp) as .npy files. Returns the per-county MAPE DataFrame.
    """

    # NYC borough FIPS codes: Bronx, Brooklyn, Manhattan, Queens, Staten Island.
    FIPS = [36005, 36047, 36061, 36081, 36085]

    forecast_horizon = data_horizon + look_ahead
    # NOTE(review): the next line hard-codes the horizon to 30 weeks, making
    # the assignment above (and the look_ahead parameter) dead code — looks
    # like a debugging leftover; confirm which horizon is intended.
    forecast_horizon = 30
    best_ids = np.unique(ids)
    dict_d = {}
    for b_id in best_ids:
        best_cell = work_dir + 'outs/cell_{}.out'.format(b_id)
        sim_epicurve = get_weekly_epicurve(best_cell, 7)
        # Scale simulated counts by each county's ED reporting fraction.
        sim_epicurve = sim_epicurve.multiply(ED_frac.ED_frac.values, axis=0)

        dict_d[b_id] = sim_epicurve.iloc[:, range(forecast_horizon)]

    ## compute summaries
    dff = {}
    dfp = pd.DataFrame(columns=[0.05, 0.275, 0.5, 0.725, 0.95], index=FIPS)

    for fips in FIPS:
        dd = []
        # NOTE(review): b_id here is the leftover value from the loop above;
        # all dict_d entries share the same width, so this only reads shape.
        dfc = pd.DataFrame(columns=[0.05, 0.275, 0.5, 0.725, 0.95], index=range(dict_d[b_id].shape[1]))
        for k in dict_d.keys():
            dd.append(dict_d[k].loc[fips])
        df = pd.DataFrame(dd, index=dict_d.keys())
        # Indexing df by the (repeated) sampled ids re-weights each particle
        # by how often it was drawn, so these are weighted quantiles.
        for c in dfc.columns:
            dfc[c] = df.loc[ids, :].quantile(q=c, axis=0)

        dff[fips] = dfc
        # Distribution of the peak week across the weighted sample.
        dfp.loc[fips, :] = df.loc[ids, :].idxmax(axis=1).quantile([0.05, 0.275, 0.5, 0.725, 0.95])

    # Median forecast for weeks [data_horizon, forecast_horizon) vs truth.
    pred_df = pd.DataFrame(columns=range(data_horizon, forecast_horizon), index=FIPS)
    for f in FIPS:
        pred_df.loc[f, :] = dff[f].loc[range(data_horizon, forecast_horizon), 0.5]
    diff_df = abs(pred_df - ground_truth.loc[:, range(data_horizon, forecast_horizon)])
    # NOTE(review): .loc above vs .iloc below are equivalent only if
    # ground_truth columns are a 0-based range (main() renames them so).
    mape_df = diff_df.divide(ground_truth.iloc[:, range(data_horizon, forecast_horizon)])

    if log:
        if th is None:
            np.save('{}forecast_{}_{}_log.npy'.format(work_dir, data_horizon, d), dff)
            np.save('{}peak_forecast_{}_{}_log.npy'.format(work_dir, data_horizon, d), dfp)
        else:
            np.save('{}forecast_{}_{}_{}_log_30.npy'.format(work_dir, data_horizon, d, th), dff)
            np.save('{}peak_forecast_{}_{}_{}_log_30.npy'.format(work_dir, data_horizon, d, th), dfp)
    else:
        np.save('{}forecast_{}_{}.npy'.format(work_dir, data_horizon, d), dff)
        np.save('{}peak_forecast_{}_{}.npy'.format(work_dir, data_horizon, d), dfp)

    return mape_df

def main(log):
    """Compute per-county MAPE for each fitting horizon and write a summary CSV.

    log selects the output filename variant and is now forwarded to
    get_ids/get_mape (the original hard-coded log=True there despite taking
    log as a parameter; the sole call site passes True, so behavior at that
    call site is unchanged).

    Command-line arguments:
      -s/--season     season year tag used in the surveillance filenames
      -ddir/--work_dir directory (trailing separator) with cells_*.csv, outs/
      -cumd/--cum_days aggregation-window tag used in filenames
      -t/--th         optional threshold tag
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--season')
    parser.add_argument('-ddir', '--work_dir')
    parser.add_argument('-cumd', '--cum_days')
    parser.add_argument('-t', '--th')

    args = parser.parse_args()
    th = args.th

    ## ED_ILIplus is weekly
    datadir = 'data/surveillance'
    county_fips = pd.DataFrame({'FIPS': [36005, 36047, 36061, 36081, 36085],
                                'Borough': ['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'],
                                'color': ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00']})
    # os.path.join supplies the path separator the original plain
    # concatenation was missing ('data/surveillancenyc_...').
    ground_truth = pd.read_csv(os.path.join(datadir, 'nyc_ED_ILIplus_{}.csv'.format(args.season)), index_col=0)
    ground_truth.columns = range(len(ground_truth.columns))

    ED_frac = pd.read_csv(os.path.join(datadir, 'nyc_ED_frac_{}.csv'.format(args.season)), index_col=0)
    # (An unused adjusted-ground-truth computation was removed; ED_frac is
    # still passed through to get_mape, which does the scaling.)

    horizons = np.arange(10, 27, 2)  # fitting horizons: 10, 12, ..., 26 weeks
    mape_dff = pd.DataFrame(columns=range(len(horizons)), index=county_fips.FIPS)
    for k, data_horizon in enumerate(horizons):
        ids = get_ids(args.work_dir, data_horizon, args.cum_days, 100000, log=log, th=th)
        mape_df = get_mape(args.work_dir, data_horizon, args.cum_days, ids, ground_truth, ED_frac, log=log, th=th)
        mape_dff.iloc[:, k] = mape_df.mean(1).tolist()

    # (The original guarded this write with an always-True write_mape flag.)
    if log:
        if th is None:
            mape_dff.to_csv(args.work_dir + 'mape_all_' + str(args.cum_days) + '_log.csv')
        else:
            mape_dff.to_csv(args.work_dir + 'mape_all_' + str(args.cum_days) + '_' + str(th) + '_log.csv')
    else:
        mape_dff.to_csv(args.work_dir + 'mape_all_' + str(args.cum_days) + '.csv')


# NOTE(review): runs at import time; consider an `if __name__ == '__main__':`
# guard if this module is ever imported.
main(log=True)
118 changes: 118 additions & 0 deletions code/gen_seasonal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import pandas as pd
import itertools
import numpy as np
from scipy.stats import norm
import scipy.stats
from joblib import Parallel, delayed
import multiprocessing
import subprocess
import random
import os
from sklearn.utils.extmath import cartesian
import sys
from datetime import datetime
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-s', '--season')    # season year tag, e.g. 2016
parser.add_argument('-wd', '--work_dir')  # directory with cells_*.csv and outs/
parser.add_argument('-l', '--label')     # tag used in the output filename

# NOTE: arguments are parsed at import time — this file is a script, not a library.
args = parser.parse_args()

datadir = 'data/surveillance'
# NYC borough FIPS codes with display names and plot colors.
county_fips = pd.DataFrame({'FIPS': [36005, 36047, 36061, 36081, 36085],
                            'Borough': ['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'],
                            'color': ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00']})


def read_ground_truth(season):
    """Load the weekly NYC ED ILI+ curve for a season and the per-county ED
    reporting fraction.

    Returns (ED_frac, adj_ground_truth) where adj_ground_truth divides the
    observed counts by each county's ED fraction, putting them on the
    simulation's scale.
    """
    # os.path.join supplies the path separator the original plain
    # concatenation was missing ('data/surveillancenyc_...').
    ground_truth = pd.read_csv(os.path.join(datadir, 'nyc_ED_ILIplus_{}.csv'.format(season)), index_col=0)
    ground_truth.columns = range(len(ground_truth.columns))

    ED_frac = pd.read_csv(os.path.join(datadir, 'nyc_ED_frac_{}.csv'.format(season)), index_col=0)
    adj_ground_truth = ground_truth.divide(ED_frac.ED_frac.values, axis=0)

    return ED_frac, adj_ground_truth

def get_epicurve(filename, end_week=30, d=7):
    """Read one simulated epicurve, keep the five NYC boroughs, truncate to
    the first (end_week-1)*7 daily columns, and aggregate into d-day bins.

    Note: implemented via transpose-then-groupby because
    DataFrame.groupby(axis=1) is deprecated since pandas 2.1 and removed
    in pandas 3.0.
    """
    sim_epicurve = pd.read_csv(filename, index_col=0, header=None, delimiter=' ')
    sim_epicurve = sim_epicurve.loc[[36005, 36047, 36061, 36081, 36085], :]
    sim_epicurve = sim_epicurve.iloc[:, range((end_week - 1) * 7)]

    return sim_epicurve.T.groupby(sim_epicurve.columns // d).sum().T

def get_ids(score_file, nsamp=1000):
    """Draw nsamp particle ids (with replacement) weighted by cell weight.

    score_file : CSV with at least 'id' and 'weight' columns (weights sum to 1).
    nsamp      : sample size; previously hard-coded to 1000, now a parameter
                 with the same default, so existing callers are unchanged.
    """
    cells = pd.read_csv(score_file)
    ids = np.random.choice(cells.id, nsamp, True, cells.weight)

    return ids

def get_best_training(work_dir, b_ids, adj_ground_truth, data_horizon):
    """Mean absolute percentage error over the training window for each
    candidate particle id; returns {id: MAPE}."""
    window = range(data_horizon)
    truth = adj_ground_truth.iloc[:, window]
    scores = {}
    for cand in b_ids:
        curve = get_epicurve('{}outs/cell_{}.out'.format(work_dir, cand))
        rel_err = abs(curve.iloc[:, window] - truth) / truth
        scores[cand] = np.mean(np.mean(rel_err))
    return scores

def get_pred_score(work_dir, b_id, adj_ground_truth, data_horizon, look_ahead=4):
    """MAPE of one candidate's curve over the forecast window.

    The window is weeks data_horizon+1 .. data_horizon+look_ahead-1: the fit
    week itself is excluded and the range end is exclusive (preserved as-is
    from the original).
    """
    window = range(data_horizon + 1, data_horizon + look_ahead)
    curve = get_epicurve('{}outs/cell_{}.out'.format(work_dir, b_id))
    truth = adj_ground_truth.iloc[:, window]
    rel_err = abs(curve.iloc[:, window] - truth) / truth
    return np.mean(np.mean(rel_err))

def get_peak(work_dir, b_ids):
    """Peak week and peak size of each candidate's epicurve.

    Returns two DataFrames (peak times, peak sizes) whose columns are the
    positions 0..len(b_ids)-1 of the sampled ids and whose rows are patches.
    """
    ptime = {}
    psize = {}
    for pos, cand in enumerate(b_ids):
        curve = get_epicurve('{}outs/cell_{}.out'.format(work_dir, cand))
        ptime[pos] = curve.idxmax(1)
        psize[pos] = curve.max(1)
    return pd.DataFrame(ptime), pd.DataFrame(psize)

def get_onset(work_dir, b_ids, adj_ground_truth, onset_th=0.1):
    """Onset week per borough for each candidate: the first week whose
    simulated count exceeds onset_th * the observed peak for that borough.

    Returns a DataFrame with one row per candidate id and one column per
    borough FIPS code.
    """
    # Per-borough onset threshold: fraction of the observed seasonal peak.
    s = adj_ground_truth.max(1) * onset_th
    on = []
    for b_id in b_ids:
        # Read each candidate's epicurve once (the original re-read the same
        # file inside the borough loop — 5x redundant I/O per candidate).
        sim_epicurve = get_epicurve('{}outs/cell_{}.out'.format(work_dir, b_id))
        on_c = []
        for f in [36005, 36047, 36061, 36081, 36085]:
            # idxmax on the boolean series returns the first week above threshold.
            ss = sim_epicurve.loc[f, :] > s[f]
            on_c.append(ss.idxmax())
        on.append(on_c)

    on_df = pd.DataFrame(on)
    on_df.columns = [36005, 36047, 36061, 36081, 36085]

    return on_df

season = int(args.season)
work_dir = args.work_dir
label = args.label

ED_frac, adj_ground_truth = read_ground_truth(season)
# Seasonal targets (peak time/size, onset week) for each fitting horizon.
peak_time = {}
peak_size = {}
onset = {}
for data_horizon in [10, 12, 14, 16, 18, 20, 22, 24, 26]:
    # Score-file naming convention: cells_<horizon>_<d=7>_<th=0.0>_log.csv
    ids = get_ids('{}cells_{}_7_0.0_log.csv'.format(work_dir, data_horizon))
    ptime_df, psize_df = get_peak(work_dir, ids)
    on_df = get_onset(work_dir, ids, adj_ground_truth, onset_th=0.1)

    # Transposed so rows = sampled particles, columns = boroughs.
    peak_time[data_horizon] = ptime_df.transpose()
    peak_size[data_horizon] = psize_df.transpose()
    onset[data_horizon] = on_df


d = {}
d['peakT'] = peak_time
d['peakS'] = peak_size
d['onsetT'] = onset

# NOTE(review): the output lands in the current directory, not work_dir —
# presumably intended; confirm.
np.save('{}_{}_seasonal_target.npy'.format(season, label), d)
101 changes: 101 additions & 0 deletions code/patchRun_AUS.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import pandas as pd
import itertools
import numpy as np
import multiprocessing
from joblib import Parallel, delayed
import subprocess
import random
import os
from sklearn.utils.extmath import cartesian
import sys

from datetime import datetime

sys.path.append('/projects/PatchSim/') ## path to clone of NSSAC/PatchSim
import patchsim as sim

datadir = 'data/surveillance'
# Australian states/territories: short IDs (as used in the data files) and full names.
state_id = pd.DataFrame({'ID': ['NSW', 'Qld', 'SA', 'Tas', 'Vic', 'WA', 'ACT', 'NT'],
                         'State': ['New South Wales', 'Queensland', 'South Australia', 'Tasmania', 'Victoria', 'Western Australia', 'Australian Capital Territory', 'Northern Territory'],
                         })
season = 2016
suffix = '_RAD'  ## mobility model
config_template = 'cfg_AUS' + suffix
# Date-stamped working directory, e.g. '129_AUS_RAD_2016/'; note the trailing
# '/' — downstream paths are built by plain string concatenation.
work_dir = str(datetime.now().month) + str(datetime.now().day) + '_AUS' + suffix + '_{}/'.format(season)

# Start from a clean working directory. shutil/os replace the original
# subprocess calls ('rm -R'/'mkdir' with shell=True), which were unsafe
# (shell string built from a variable) and non-portable.
import shutil

if work_dir[:-1] in os.listdir('.'):
    shutil.rmtree(work_dir)

os.mkdir(work_dir)

### uniform sampling of 100000 particles
N = 100000
# Disease-parameter priors sampled uniformly per particle.
Betas = np.random.uniform(0.3, 0.9, size=N)
Alphas = np.random.uniform(0.3, 0.9, size=N)
Gammas = np.random.uniform(0.3, 0.9, size=N)
# Seeding rule: case-count threshold and how many patches to seed.
SeedThreshold = np.random.choice(range(10, 100), size=N, replace=True)
SeedPatchCount = np.random.choice(range(6, 8), size=N, replace=True)

# Per-state surveillance scaling factors; 7 columns (ACT is not scaled).
scaling_df = pd.DataFrame(np.random.uniform(0.01, 0.02, size=(N, 7)))
scaling_df.columns = ['NSW', 'Qld', 'SA', 'Tas', 'Vic', 'WA', 'NT']

# One row per particle: disease params + seeding params + scaling factors.
cells = pd.DataFrame(np.array([Betas, Alphas, Gammas, SeedThreshold, SeedPatchCount]))
cells = pd.concat([cells.transpose(), scaling_df], axis=1, ignore_index=True)
cells.columns = ['beta', 'alpha', 'gamma', 'seedT', 'seedN', 'scaling1', 'scaling2', 'scaling3', 'scaling4', 'scaling5', 'scaling6', 'scaling7']

# Persist the particle table and scaling factors for later scoring runs.
cells.index.name = 'id'
cells.to_csv(work_dir + 'cells.csv')
scaling_df.index.name = 'id'
scaling_df.to_csv(work_dir + 'scaling.csv')

if 'cfgs' not in os.listdir(work_dir):
    os.mkdir(work_dir + 'cfgs')

if 'logs' not in os.listdir(work_dir):
    os.mkdir(work_dir + 'logs')

# Expand the PatchSim config template once per particle: '$work_dir' is
# substituted once up front, then '$id' and each '$<param>' placeholder per
# particle.
with open(config_template, 'r') as f:
    cfg_template = f.read()
cfg_workdir = cfg_template.replace('$work_dir', str(work_dir))

for index, row in cells.iterrows():
    cfg_copy = cfg_workdir
    cfg_copy = cfg_copy.replace('$id', str(index))
    for param in cells.columns:
        cfg_copy = cfg_copy.replace('${}'.format(param), str(row['{}'.format(param)]))
    with open(work_dir + 'cfgs/cfg_{}'.format(index), 'w') as f:
        f.write(cfg_copy)

## ground truth: weekly flu-positive counts per state, long format -> wide.
# os.path.join supplies the path separator the original plain concatenation
# was missing ('data/surveillanceaus_flupositive_2016.csv').
ground_truth = pd.read_csv(os.path.join(datadir, 'aus_flupositive_2016.csv'))
ground_truth = ground_truth.pivot(index='State', columns='Date', values='Count')
ground_truth.columns = range(len(ground_truth.columns))

if 'seeds' not in os.listdir(work_dir):
    os.mkdir(work_dir + 'seeds')

def gen_seed(i, ground_truth, cells, scaling_df):
    """Write the seeding file for particle i.

    For each state, the seed week is the first week whose observed count
    exceeds the particle's threshold; the seedN earliest-crossing states are
    kept, and their counts are rescaled by the particle's per-state scaling
    factor. Writes '<work_dir>seeds/seed_<i>.csv' (Day State Count,
    space-separated, no header).
    """
    seedT = int(cells['seedT'][i])
    seedN = int(cells['seedN'][i])
    scaling = scaling_df.loc[i, :]
    # idxmax on the boolean frame returns, per state, the first week > seedT.
    seed_times = ground_truth.applymap(lambda x: x > seedT).idxmax(axis=1).reset_index()
    # Earliest seedN states; .loc label slicing is inclusive, hence seedN-1.
    # (Column 0 holds the crossing week from the reset_index above.)
    seeds = seed_times.sort_values(0).reset_index(drop=True).loc[0:seedN - 1]
    seeds['Count'] = seeds.apply(lambda row: ground_truth.loc[row['State'], row[0]], axis=1)
    # Convert observed counts to the simulation's population scale.
    seeds['Count'] = np.divide(np.array(seeds['Count']), np.array(scaling[seeds['State']]))

    seeds.columns = ['State', 'Day', 'Count']
    # Weeks -> days: PatchSim runs on a daily clock.
    seeds['Day'] *= 7
    seeds[['Day', 'State', 'Count']].to_csv(work_dir + 'seeds/seed_{}.csv'.format(i),
                                            index=None, header=None, sep=' ')
num_cores = 40
# Generate all seed files in parallel (side-effect only; 'results' is unused).
results = Parallel(n_jobs=num_cores)(delayed(gen_seed)(i, ground_truth, cells, scaling_df) for i in range(N))

if 'outs' not in os.listdir(work_dir):
    os.mkdir(work_dir + 'outs')


def run_patch(cfg):
    # Run one PatchSim simulation; write_epi=True emits the epicurve under outs/.
    sim.run_disease_simulation(sim.read_config(cfg), write_epi=True)


# One simulation per generated config file, run in parallel.
cfgs = [work_dir + 'cfgs/{}'.format(x) for x in os.listdir(work_dir + 'cfgs/') if 'cfg_' in x]
num_cores = 40
results = Parallel(n_jobs=num_cores)(delayed(run_patch)(cfg) for cfg in cfgs)
Loading

0 comments on commit 746646c

Please sign in to comment.