formalize preprocessing + incorporate into rest of workflow #1
preprocess.yaml
@@ -0,0 +1,49 @@
target_default: preprocess

packages:
  - vizlab

sources:
  - scripts/preprocess/fetch_s3_object.R
  - scripts/preprocess/clean_county_boundaries.R
  - scripts/preprocess/save_state_fips.R
  - scripts/preprocess/execute_shell_script.R
  - scripts/preprocess/push_s3_object.R

targets:

  # --- fetch --- #

  # takes about 12 minutes
  cache/IPUMS_NHGIS_counties.zip:
    command: fetch_s3_object(target_name, I("IPUMS_NHGIS_counties.zip"), I("viz-water-use-15"))

  # --- process --- #

  # takes about 45 minutes & about 5.5 GB of disk space for all FIPS
  cache/county_boundaries_geojson.zip:
    command: clean_county_boundaries(target_name, "cache/IPUMS_NHGIS_counties.zip")

  # takes about 30 seconds
  cache/state_fips.csv:
    command: save_state_fips(target_name, "cache/county_boundaries_geojson.zip", I("states.json"))

  # takes about 10 minutes & about 3 GB of disk space for all FIPS
  cache/county_boundaries_topojson.zip:
    command: execute_shell_script(target_name, "cache/county_boundaries_geojson.zip",
                                  I("scripts/preprocess/topo_county_boundaries.sh"),
                                  "cache/state_fips.csv")

  # --- publish --- #

  # takes about 12 minutes
  s3boundariesfile:
    command: push_s3_object(I("county_boundaries_topojson.zip"),
                            "cache/county_boundaries_topojson.zip",
                            I("viz-water-use-15"))

  # --- final --- #

  preprocess:
    depends:
      - s3boundariesfile
@@ -0,0 +1,15 @@
# no remote timestamp check: the S3 object is treated as always current,
# so it is only fetched when the local copy is missing
fetchTimestamp.s3_object <- vizlab::alwaysCurrent

fetch.s3_object <- function(viz){

  args <- viz[["fetch_args"]]

  aws.signature::use_credentials(profile = 'default', file = aws.signature::default_credentials_file())

  # download the object from the s3 bucket to the viz item's location
  object_fn <- aws.s3::save_object(object = args[["object_name"]],
                                   bucket = args[["bucket_name"]],
                                   file = viz[["location"]])
  return(object_fn)
}
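For context on how this fetcher gets used: it reads `fetch_args` and `location` off the viz item. A hypothetical item (the id, object, and bucket names below are illustrative, not from this PR) could be exercised directly like so:

# hypothetical viz item; field values are illustrative only
viz <- vizlab::as.viz(list(
  id = "county-boundaries-s3",
  location = "cache/county_boundaries_topojson.zip",
  fetch_args = list(object_name = "county_boundaries_topojson.zip",
                    bucket_name = "viz-water-use-15")
))
fetch.s3_object(viz)  # downloads the object to the viz item's location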
@@ -0,0 +1,47 @@
# This file describes how to run `preprocess.yaml`, the remake file that
# orchestrates the steps required for preprocessing the county boundary and
# state/county FIPS data. It should not need to be executed by every
# contributor because the results are stored in the S3 bucket; most
# contributors only need to worry about viz.yaml.

# This workflow assumes that you have the required R packages and appropriate
# AWS credentials (under the "default" profile) stored in:
aws.signature::default_credentials_file()
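# For reference, a shared AWS credentials file usually looks like this
# (placeholder values, not real keys):
#
#   [default]
#   aws_access_key_id = AKIAXXXXXXXXXXXXXXXX
#   aws_secret_access_key = xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx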
# required for topo_county_boundaries.sh:
# install node.js (https://nodejs.org/en/), then run
#   npm install -g topojson

# required R packages:
#
# aws.s3:
#   repo: CRAN
#   version: 0.3.3
# aws.signature:
#   repo: CRAN
#   version: 0.3.5
# dplyr:
#   repo: CRAN
#   version: 0.7.4
# geojsonio:
#   repo: CRAN
#   version: 0.5.0
# jsonlite:
#   repo: CRAN
#   version: 1.5
# remake:
#   repo: github
#   version: 0.3.0
#   name: richfitz/remake
# sf:
#   repo: CRAN
#   version: 0.6.0

# run the full preprocessing workflow
# this will take ~30 minutes; the longest step is fetching the data from S3
remake::make(target_names = "preprocess",
             remake_file = "preprocess.yaml")

# run an individual target:
remake::make(target_names = "cache/county_boundaries_topojson.zip",
             remake_file = "preprocess.yaml")
scripts/preprocess/execute_shell_script.R
@@ -0,0 +1,6 @@
execute_shell_script <- function(location, zipfilepath, shell_script_fn, statecsvpath){

  # run the shell script, passing the geojson zip, the state fips csv, and
  # the output location as positional arguments
  cmd <- paste("bash", shell_script_fn, zipfilepath, statecsvpath, location)
  system(cmd)
}
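For reference, here is the call preprocess.yaml makes for the topojson target and the shell command it assembles (derived from the yaml above, not new behavior):

execute_shell_script("cache/county_boundaries_topojson.zip",          # location  -> $3
                     "cache/county_boundaries_geojson.zip",           # zipfilepath -> $1
                     "scripts/preprocess/topo_county_boundaries.sh",  # shell_script_fn
                     "cache/state_fips.csv")                          # statecsvpath -> $2
# equivalent to:
#   bash scripts/preprocess/topo_county_boundaries.sh cache/county_boundaries_geojson.zip cache/state_fips.csv cache/county_boundaries_topojson.zip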
scripts/preprocess/fetch_s3_object.R
@@ -0,0 +1,10 @@
fetch_s3_object <- function(location, obj_name, bucket_name){

  aws.signature::use_credentials(profile = 'default', file = aws.signature::default_credentials_file())

  # download the object from the s3 bucket
  object_fn <- aws.s3::save_object(object = obj_name,
                                   bucket = bucket_name,
                                   file = location)
  return(object_fn)
}
scripts/preprocess/push_s3_object.R
@@ -0,0 +1,10 @@
push_s3_object <- function(s3_fn, existing_fn, bucket_name) {

  aws.signature::use_credentials(profile = 'default', file = aws.signature::default_credentials_file())

  # upload the local file to the s3 bucket under the given object name
  s3_push <- aws.s3::put_object(file = existing_fn,
                                object = s3_fn,
                                bucket = bucket_name)

  return(s3_push)
}
scripts/preprocess/save_state_fips.R
@@ -0,0 +1,13 @@
save_state_fips <- function(location, zipfilepath, jsonfilepath) {

  # extract states.json into cache/
  dir_name <- dirname(zipfilepath)
  unzip(zipfilepath, files = jsonfilepath, exdir = dir_name)

  # read the json and pull out just the state fips codes
  states_info <- jsonlite::fromJSON(file.path(dir_name, jsonfilepath))
  fips <- states_info[["state_FIPS"]]

  # write.csv ignores col.names, so use write.table to omit the header
  write.table(fips, location, sep = ",", col.names = FALSE, row.names = FALSE, quote = FALSE)
}
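A note on that last line: write.csv() silently ignores attempts to set col.names (it always writes a header row), which is why write.table() is used to produce a bare, header-less column of fips codes, one per line (illustrative values):

01
02
04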
scripts/preprocess/topo_county_boundaries.sh
@@ -0,0 +1,45 @@
#!/bin/bash

# create a temp directory
TMP=$(mktemp -d)

# unzip the geojson
unzip "$1" -d "$TMP"

# pick out the geojson files (exclude counties.js and states.js)
GJ=$(dir "$TMP"/*.geojson)

# loop over the state fips codes listed in the csv ($2)
while read fip
do

  # strip any trailing carriage return from the csv line
  fipfixed=$(echo "$fip" | tr -d '\r')
  path="$TMP"/"$fipfixed"

  # convert to topojson
  geo2topo \
    state="$path".geojson \
    -o "$path".json

  # simplify
  toposimplify -s 1e-4 -f \
    "$path".json \
    -o "$path"-simple.json

  # quantize (store as integers, scale later)
  topoquantize 1e5 \
    "$path"-simple.json \
    -o "$path"-quantized.json

  echo "Finished $fipfixed"

done < "$2"

echo "All done"

# zip back up for storage in cache/
WD=$(pwd)
cd "$TMP"
zip "$WD/$3" ./*quantized.json states.json counties.json
cd "$WD"

Review comments on the `while read fip` loop:
> neat! good job getting this bash loop figured out.
> the real mvp: @wdwatkins
> 💯
This file was deleted.
@@ -0,0 +1,40 @@
publish.multiple_json_files <- function(viz) {
  deps <- readDepends(viz)
  args <- viz[["publish_args"]]
  file_pattern <- args[["pattern"]]

  # unzip if it's a zip file
  if(grepl("\\.zip$", deps[["files_location"]])) {
    # unzip and cache in a folder before publishing
    extract_boundary_files(deps[["files_location"]], file_pattern, viz[["location"]])
    paths_to_use <- list.files(viz[["location"]], full.names = TRUE)

  } else {
    # if not zipped, the paths are just the files in the passed-in location
    paths_to_use <- list.files(deps[["files_location"]], full.names = TRUE)
  }

  for(fp in paths_to_use) {

    # create a viz-like item to use in publish
    viz_json <- vizlab::as.viz(list(location = fp, mimetype = "application/json"))

    # use the publisher to follow the typical json publishing steps for each file
    vizlab::publish(viz_json)
  }

}

#' Extract files from a zipfile
#'
#' @param zipfile the name of the .zip file
#' @param pattern pattern to match against filenames in the zip (used with grep)
#' @param exdir directory to extract the matching files into
extract_boundary_files <- function(zipfile, pattern, exdir) {

  allfiles <- unzip(zipfile = zipfile, list = TRUE)[["Name"]]
  boundaryfiles <- allfiles[grep(pattern, allfiles)]

  unzip(zipfile = zipfile, files = boundaryfiles, exdir = exdir, overwrite = TRUE)
}
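A quick usage sketch for the helper; the paths and pattern here are illustrative (the shell step above names its outputs *-quantized.json, so "quantized" would select them):

# pull only the quantized topojson files out of the cached zip
extract_boundary_files(zipfile = "cache/county_boundaries_topojson.zip",
                       pattern = "quantized",
                       exdir = "cache/county_topojson")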
@@ -0,0 +1,14 @@
# This file only needs to be run one time, by one person, for the whole project.
# It is included so it's easier to create buckets again in the future.

library(aws.signature)
message('check that credentials for dev profile at ', aws.signature::default_credentials_file(),
        " match those in get_dssecret('dev-owi-s3-access')")
aws.signature::use_credentials(profile = 'dev', file = aws.signature::default_credentials_file())

library(aws.s3)
bucketlist() # see which buckets already exist
new_bucket_name <- 'viz-water-use-15' # convention: 'viz-' followed by the github repo name for the vizzy
put_bucket(new_bucket_name, region = 'us-west-2', acl = 'private') # errors if the bucket already exists

# this command posted the data (took 1.5 hrs)
put_object(file = 'data/nhgis0002_shape.zip', object = 'IPUMS_NHGIS_counties.zip', bucket = 'viz-water-use-15')
Review comment:
> this is fine for now but reminds me that we ought to get an s3 fetcher implemented in vizlab proper, so we can use s3 timestamps/hashes to keep everybody's files up to date. USGS-VIZLAB/vizlab#333
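A minimal sketch of what that freshness check could look like, assuming the object's Last-Modified header is available via aws.s3::head_object() (a hypothetical helper, not vizlab's or this PR's API; header handling may differ across aws.s3 versions):

s3_needs_refetch <- function(object_name, bucket_name, local_file) {
  # no local copy yet, so fetch
  if (!file.exists(local_file)) return(TRUE)
  # HEAD request; aws.s3 attaches response headers as attributes
  head <- aws.s3::head_object(object = object_name, bucket = bucket_name)
  remote_time <- httr::parse_http_date(attr(head, "last-modified"))
  # re-fetch only if the S3 copy is newer than the local file
  isTRUE(file.mtime(local_file) < remote_time)
}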