forked from aatishb/indiatestpositivitydata
-
Notifications
You must be signed in to change notification settings - Fork 0
/
getdistrictdata.py
71 lines (57 loc) · 3.04 KB
/
getdistrictdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# script to parse India district level COVID test positivity data from https://www.mohfw.gov.in/
# the data is appended to districtdata.csv and archived in the archive folder
from datetime import datetime
from io import BytesIO
from urllib import request
from urllib.error import HTTPError

import pandas as pd
import pytz
# Take "now" as a timezone-aware timestamp in the Asia/Kolkata zone (IST);
# the date strings built further down are all derived from this value.
IST = pytz.timezone('Asia/Kolkata')
today = datetime.now(tz=IST)
print('checking for new district data on', today)
# Spelled-out equivalent of the one-liner at https://stackoverflow.com/a/20007730
def ordinal(n):
    """Return integer *n* followed by its English ordinal suffix.

    Examples: 1 -> '1st', 2 -> '2nd', 3 -> '3rd', 4 -> '4th', 11 -> '11th',
    21 -> '21st'.

    A number whose tens digit is 1 (11, 12, 13, 111, ...) or whose last
    digit is 0 or 4-9 takes 'th'; otherwise the last digit selects
    'st', 'nd' or 'rd'.
    """
    last_digit = n % 10
    in_teens = n // 10 % 10 == 1
    if in_teens or last_digit > 3:
        suffix = "th"
    else:
        suffix = ("th", "st", "nd", "rd")[last_digit]
    return "%d%s" % (n, suffix)
# English month names, indexed by (month number - 1).
monthname = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Ordinal day immediately followed by the month name, e.g. "21stMay".
# This string is spliced into the spreadsheet URL further down.
date = ordinal(today.day) + monthname[today.month - 1]

# ISO form (YYYY-MM-DD) of today's date; this is the value stored in the
# 'Date' column of districtdata.csv.
currentdate = str(today.date())

# Rows are appended to districtdata.csv with the date in the first column,
# so the first cell of the last row is the date of the most recent import.
csv = pd.read_csv('districtdata.csv', header=0)

# A file that holds only the header line has no last row, and iloc[-1, 0]
# would raise IndexError. Use None in that case so the comparison against
# currentdate below is unequal and the first import goes ahead.
prevdate = None if csv.empty else csv.iloc[-1, 0]
# Fetch, parse, append and archive today's spreadsheet, unless the CSV
# already ends with a row for today.
if prevdate != currentdate:
    url = 'https://www.mohfw.gov.in/pdf/COVID19DistrictWisePositivityAnalysis' + date + '.xlsx'
    try:
        # Download the workbook exactly once. The same bytes are parsed
        # below and later written to the archive, so the archived file is
        # guaranteed to be the one the CSV rows came from.
        with request.urlopen(url) as response:
            workbook = response.read()
    except HTTPError as err:
        if err.code == 404:
            # No file at today's URL.
            print('district data has not yet been updated for', currentdate)
        else:
            # Any other HTTP failure is reported without aborting the script.
            print(err)
    else:
        df = pd.read_excel(BytesIO(workbook), engine='openpyxl')

        # The first row containing 'ANDAMAN AND NICOBAR ISLANDS' is treated
        # as the first data row; the row directly above it supplies the
        # column headers.
        firstrow = df[df.isin(['ANDAMAN AND NICOBAR ISLANDS']).any(axis=1)].index[0]
        headerrow = firstrow - 1
        df.columns = df.iloc[headerrow]
        df = df[df.index >= firstrow]

        # Make repeated header names unique: the first occurrence keeps its
        # name, later ones get '.1', '.2', ... appended. str() keeps this
        # from raising TypeError if a repeated header is not a string
        # (e.g. NaN from an empty header cell).
        seen = {}
        columns = []
        for name in df.columns:
            repeats = seen.get(name, 0)
            if repeats == 0:
                columns.append(name)
            else:
                columns.append(str(name) + '.' + str(repeats))
            seen[name] = repeats + 1
        df.columns = columns

        # The sheet carries three side-by-side State/District/Positivity
        # tables, distinguished after renaming by the suffixes '', '.1' and
        # '.2' (the earlier version of this script labelled them over 10%,
        # between 5% and 10%, and under 5%). For each table: drop rows that
        # are entirely empty, drop the final remaining row (presumably a
        # summary row - TODO confirm against the sheet), then forward-fill
        # blanks from the row above.
        bands = []
        for suffix in ('', '.1', '.2'):
            band = df[['State' + suffix, 'District' + suffix, 'Positivity' + suffix]]
            # .ffill() replaces fillna(method='ffill'), which is deprecated
            # in pandas 2.1 and later.
            band = band.dropna(thresh=1).iloc[:-1, :].ffill()
            band = band.rename(columns={
                'State' + suffix: 'State',
                'District' + suffix: 'District',
                'Positivity' + suffix: 'Positivity',
            })
            bands.append(band)

        output = pd.concat(bands).sort_values(by=['State', 'District']).rename(columns={'Positivity': 'Test Positivity Rate'})
        output['Date'] = currentdate

        # Append without a header row; the existing file already has one.
        output.to_csv("districtdata.csv", columns=['Date', 'State', 'District', 'Test Positivity Rate'], header=False, index=False, mode='a')
        print('added district data for', currentdate, 'to districtdata.csv')

        # Archive the downloaded workbook under its original file name.
        path = 'archive/' + url.split('/')[-1]
        with open(path, 'wb') as archive_file:
            archive_file.write(workbook)
        print('saved district data for', currentdate, 'to archive folder')
else:
    print('already have district data for', currentdate)