Skip to content

Commit

Permalink
update script and settings
Browse files Browse the repository at this point in the history
  • Loading branch information
semio committed Dec 14, 2016
1 parent 67de6b9 commit bd5cfc6
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 142 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
etl/source/WDI_csv

# Tempfiles of OSX/Win
Thumbs.db
.DS_Store
Expand Down
101 changes: 0 additions & 101 deletions etl/script/index.py

This file was deleted.

83 changes: 42 additions & 41 deletions etl/script/wdi.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import numpy as np
import re
import os
from index import create_index_file
import json
from ddf_utils.index import get_datapackage

# configuration of file path.
source_dir = '../source/WDI_csv/'
Expand Down Expand Up @@ -112,43 +113,43 @@ def extract_datapoints_country_year(data):


if __name__ == '__main__':
print('reading source files...')
data = pd.read_csv(data_csv, encoding='latin', dtype=str)
country = pd.read_csv(country_csv, encoding='latin', dtype=str)
series = pd.read_csv(series_csv, encoding='latin', dtype=str)

print('creating concepts files...')
concept_continuous = extract_concept_continuous(country, series)
concept_continuous.to_csv(
os.path.join(output_dir, 'ddf--concepts--continuous.csv'),
index=False, encoding='utf8')

concept_discrete = extract_concept_discrete(country, series)
concept_discrete.to_csv(
os.path.join(output_dir, 'ddf--concepts--discrete.csv'),
index=False, encoding='utf8')

print('creating entities files...')
entities_country = extract_entities_country(country, series)
entities_country.to_csv(
os.path.join(output_dir, 'ddf--entities--country.csv'),
index=False, encoding='utf8')

print('creating datapoints...')
datapoints = extract_datapoints_country_year(data)
for k, v in datapoints.items():
v[k] = pd.to_numeric(v[k])
v.to_csv(
os.path.join(output_dir,
'ddf--datapoints--'+k+'--by--country--year.csv'),
index=False,
encoding='utf8',
# keep 10 decimal digits. This stops pandas from
# using scientific notation in the datapoints
# while also keeping precision. There are really
# small/big numbers in this dataset.
float_format='%.10f'
)

print('generating index file...')
create_index_file(output_dir, os.path.join(output_dir, 'ddf--index.csv'))
# print('reading source files...')
# data = pd.read_csv(data_csv, encoding='latin', dtype=str)
# country = pd.read_csv(country_csv, encoding='latin', dtype=str)
# series = pd.read_csv(series_csv, encoding='latin', dtype=str)
#
# print('creating concepts files...')
# concept_continuous = extract_concept_continuous(country, series)
# concept_continuous.to_csv(
# os.path.join(output_dir, 'ddf--concepts--continuous.csv'),
# index=False, encoding='utf8')
#
# concept_discrete = extract_concept_discrete(country, series)
# concept_discrete.to_csv(
# os.path.join(output_dir, 'ddf--concepts--discrete.csv'),
# index=False, encoding='utf8')
#
# print('creating entities files...')
# entities_country = extract_entities_country(country, series)
# entities_country.to_csv(
# os.path.join(output_dir, 'ddf--entities--country.csv'),
# index=False, encoding='utf8')
#
# print('creating datapoints...')
# datapoints = extract_datapoints_country_year(data)
# for k, v in datapoints.items():
# v[k] = pd.to_numeric(v[k])
# v.to_csv(
# os.path.join(output_dir,
# 'ddf--datapoints--'+k+'--by--country--year.csv'),
# index=False,
# encoding='utf8',
# # keep 10 decimal digits. This stops pandas from
# # using scientific notation in the datapoints
# # while also keeping precision. There are really
# # small/big numbers in this dataset.
# float_format='%.10f'
# )

print('generating datapackage file...')
datapackage = get_datapackage(output_dir, to_disk=True)

0 comments on commit bd5cfc6

Please sign in to comment.