Skip to content

Commit

Permalink
update source links and adapt etl script to new source file format
Browse files Browse the repository at this point in the history
  • Loading branch information
semio committed Jul 18, 2024
1 parent 0decd01 commit 4337f4e
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 11 deletions.
8 changes: 4 additions & 4 deletions etl/scripts/etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,11 @@ def extract_economy_entities(countries: pd.DataFrame, domains: pd.DataFrame, gro
sets=sets_list,
props={'name': name})

- grouped = groups.groupby(by='CountryCode')
+ grouped = groups.groupby(by='WB_Country_Code')
for eco, df in grouped:
- eco_groups = df['GroupCode'].values.tolist()
+ eco_groups = df['WB_Group_Code'].values.tolist()
eco_id = to_concept_id(eco)
- eco_name = df['CountryName'].unique()
+ eco_name = df['WB_Country_Name'].unique()
if len(eco_name) > 1:
print(f'Warning: economy {eco} has multiple names: {eco_name}')
props = {'name': eco_name[0]}
Expand Down Expand Up @@ -193,7 +193,7 @@ def main():
keep_default_na=False).dropna(how='all', axis=1)

groups = pd.read_excel(groups_xls,
- sheet_name='Groups',
+ sheet_name='compositions',
na_values=[''],
keep_default_na=False).dropna(how='all')

Expand Down
12 changes: 5 additions & 7 deletions etl/scripts/update_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,19 @@ def download(url, outpath):
# classification files
# see https://datahelpdesk.worldbank.org/knowledgebase/articles/906519-world-bank-country-and-lending-groups
zip_file = "https://databank.worldbank.org/data/download/WDI_CSV.zip"
- url_class_xls = 'http://databank.worldbank.org/data/download/site-content/CLASS.xlsx'
- url_oghist_xls = 'http://databank.worldbank.org/data/download/site-content/OGHIST.xlsx'
+ url_class_xls = 'https://datacatalogapi.worldbank.org/ddhxext/ResourceDownload?resource_unique_id=DR0090755'
+ url_oghist_xls = 'https://datacatalogapi.worldbank.org/ddhxext/ResourceDownload?resource_unique_id=DR0090754'


def update():
- # wb = WorldBankLoader()
- # print('downloading source data...')
- # wb.bulk_download('WDI', source_dir, timeout=60)
+ print('downloading source data...')
download(zip_file, '../source/WDI_csv.zip')
print('extracting...')
f = ZipFile(os.path.join(source_dir, 'WDI_csv.zip'))
f.extractall(source_dir)
- print('downloading CLASS.xls...')
+ print('downloading CLASS.xlsx...')
download(url_class_xls, os.path.join(source_dir, 'CLASS.xlsx'))
- print('downloading OGHIST.xls...')
+ print('downloading OGHIST.xlsx...')
download(url_oghist_xls, os.path.join(source_dir, 'OGHIST.xlsx'))


Expand Down

0 comments on commit 4337f4e

Please sign in to comment.