-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathunited_states_of_america.py
67 lines (50 loc) · 2.15 KB
/
united_states_of_america.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd
from urllib import request
from os import getcwd, path
import datetime
from helpers import ensure_dirs
# Raw CSV URLs in the nytimes/covid-19-data GitHub repository. Both are passed
# straight to pandas.read_csv by scrape_united_states_of_america.
# County-level rows.
COUNTIES_DATASET = 'https://github.com/nytimes/covid-19-data/raw/master/us-counties.csv'
# State-level rows.
STATES_DATASET = 'https://github.com/nytimes/covid-19-data/raw/master/us-states.csv'
def scrape_united_states_of_america():
    """Fetch the county and state datasets and write one CSV file per state.

    For every distinct ``fips`` value in the state dataset a file named
    ``<fips, zero-padded to two digits>.csv`` is written under
    ``<cwd>/data/united_states_of_america``. Each file holds the state-level
    rows (with header) followed by the rows of every county whose ``state``
    name matches. A ``README.md`` listing all written files is generated in
    the same directory.

    Both output directories are handed to ``ensure_dirs`` first (presumably
    it creates them; the helper is defined outside this file).
    """
    base_dir = getcwd()
    us_dir = path.join(base_dir, 'data', 'united_states_of_america')
    tmp_dir = path.join(base_dir, 'tmp')
    ensure_dirs(us_dir, tmp_dir)

    # Column order shared by the state and county sections of every file.
    columns = ['date', 'state', 'county',
               'place_type', 'fips', 'cases', 'deaths']
    # Floats are written without decimals ('%.f'); no index column is emitted.
    csv_options = {'index': False, 'float_format': '%.f'}

    counties = (
        pd.read_csv(COUNTIES_DATASET)
        .sort_values(by=['state', 'county', 'date'],
                     ascending=[True, True, False])
        .assign(place_type='county')
    )[columns]

    states = (
        pd.read_csv(STATES_DATASET)
        .sort_values(by=['state', 'date'], ascending=[True, False])
        .assign(county='', place_type='state')
    )[columns]

    # Maps the zero-padded FIPS code (also the file stem) to the state name.
    fips_to_state = {}
    for fips in states['fips'].unique():
        fips_key = f'{fips:02d}'
        fips_file = path.join(us_dir, f'{fips_key}.csv')

        # State-level rows go first and carry the header line.
        state_rows = states.loc[states['fips'] == fips]
        state_rows.to_csv(fips_file, **csv_options)

        # County rows are matched by state name and appended without a header.
        state_name = state_rows['state'].iloc[0]
        county_rows = counties.loc[counties['state'] == state_name]
        county_rows.to_csv(fips_file, header=False, mode='a', **csv_options)

        fips_to_state[fips_key] = state_name

    readme_path = path.join(us_dir, 'README.md')
    with open(readme_path, 'w') as readme_f:
        readme_f.write(get_readme_contents(fips_to_state))
def get_readme_contents(states):
    """Build the Markdown README for the United States dataset directory.

    Args:
        states: Mapping of file stem (the zero-padded FIPS code, e.g.
            ``'01'``) to state name. Iteration order determines row order.

    Returns:
        The README text: a heading, a "last updated" line stamped with the
        current UTC time, and a table linking each state to ``<fips>.csv``.
        The text ends with a newline.
    """
    updated_at = datetime.datetime.now(datetime.timezone.utc).strftime(
        '%b %d %Y %H:%M:%S UTC')
    rows = [
        f'| {state} | [`{fips}.csv`]({fips}.csv) |'
        for fips, state in states.items()
    ]
    table = '| State | Dataset |\n| ------ | ------- |\n' + '\n'.join(rows)

    # Blank lines must separate the heading, the blockquote and the table.
    # Without them Markdown treats the table lines as a lazy continuation of
    # the blockquote paragraph and no table is rendered.
    sections = [
        '## United States of America',
        f'> Last updated at {updated_at}.',
        table,
    ]
    return '\n\n'.join(sections) + '\n'