formalize preprocessing + incorporate into rest of workflow #1

Merged
34 commits
33d401e
create separate yaml and workflow for preprocessing steps
Feb 13, 2018
ce1d650
remove preprocessing-related things from viz.yaml
Feb 13, 2018
3c5d6c6
adding aws credentials
Feb 13, 2018
f965be3
move bucket creation script
Feb 14, 2018
4d90509
add script to share code for running the preprocessing steps
Feb 14, 2018
78be601
update aws credential profile reference
Feb 14, 2018
d2d9d1b
format + correct filename
Feb 14, 2018
f8cb41e
save instead of get for s3 download
Feb 14, 2018
88f5ba0
convert to remake file
Feb 14, 2018
6eb45f6
rework into remake format
Feb 14, 2018
121c34a
still reformatting
Feb 14, 2018
b202b10
add full target
Feb 14, 2018
c073b43
rename
Feb 14, 2018
95ecbc6
cleanup unused stuff
Feb 14, 2018
cc71fc2
shuffle/rename preprocessing scripts
Feb 14, 2018
8267603
remake for preprocessing working
Feb 14, 2018
ebbc33e
detail info for preprocessing workflow
Feb 14, 2018
b143cac
work in preprocessed county boundaries zip from s3 to viz.yaml
Feb 14, 2018
3bf9191
rename preprocessing file
Feb 14, 2018
074a606
get data pushed to target (boundary json not working yet)
Feb 14, 2018
3fb8e79
loop & pass args into shell script
Feb 15, 2018
675bdc9
instructions for topojson install
Feb 15, 2018
444c785
use fips csv in shell script
Feb 15, 2018
afae3cf
add publisher for moving multiple json files into target/data
Feb 15, 2018
28d56ee
delete unused things from previous attempt at publishing multiple jso…
Feb 15, 2018
c8669dd
move preprocessing script + add remake pkg details
Feb 15, 2018
f77e5bb
change description for a copied function
Feb 15, 2018
2c9295a
reword description of execute_preprocessing.R
Feb 15, 2018
62bf58d
add as.viz
Feb 15, 2018
36391cf
move install comment to correct spot
Feb 15, 2018
eaf9d16
add r package to function call
Feb 15, 2018
3eecee5
add message to simplify shell to follow progress
Feb 15, 2018
f5fd084
remove code that limits to just AZ and AL
Feb 15, 2018
f6dd4af
clean up yaml + add estimates for time and storage for each step
Feb 15, 2018
49 changes: 49 additions & 0 deletions preprocess.yaml
@@ -0,0 +1,49 @@
target_default: preprocess

packages:
  - vizlab

sources:
  - scripts/preprocess/fetch_s3_object.R
  - scripts/preprocess/clean_county_boundaries.R
  - scripts/preprocess/save_state_fips.R
  - scripts/preprocess/execute_shell_script.R
  - scripts/preprocess/push_s3_object.R

targets:

  # --- fetch --- #

  # takes about 12 minutes
  cache/IPUMS_NHGIS_counties.zip:
    command: fetch_s3_object(target_name, I("IPUMS_NHGIS_counties.zip"), I("viz-water-use-15"))

  # --- process --- #

  # takes about 45 minutes & about 5.5 GB of disk space for all FIPS
  cache/county_boundaries_geojson.zip:
    command: clean_county_boundaries(target_name, "cache/IPUMS_NHGIS_counties.zip")

  # takes about 30 seconds
  cache/state_fips.csv:
    command: save_state_fips(target_name, "cache/county_boundaries_geojson.zip", I("states.json"))

  # takes about 10 minutes & about 3 GB of disk space for all FIPS
  cache/county_boundaries_topojson.zip:
    command: execute_shell_script(target_name, "cache/county_boundaries_geojson.zip",
      I("scripts/preprocess/topo_county_boundaries.sh"),
      "cache/state_fips.csv")

  # --- publish --- #

  # takes about 12 minutes
  s3boundariesfile:
    command: push_s3_object(I("county_boundaries_topojson.zip"),
      "cache/county_boundaries_topojson.zip",
      I("viz-water-use-15"))

  # --- final --- #

  preprocess:
    depends:
      - s3boundariesfile
15 changes: 15 additions & 0 deletions scripts/fetch/s3_object.R
@@ -0,0 +1,15 @@

fetchTimestamp.s3_object <- vizlab::alwaysCurrent
Owner commented:
this is fine for now but reminds me that we ought to get an s3 fetcher implemented in vizlab proper, so we can use s3 timestamps/hashes to keep everybody's files up to date. USGS-VIZLAB/vizlab#333
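A rough sketch of what such a timestamp-based fetcher could look like (assumptions, not part of this PR: that aws.s3::head_object() returns the Last-Modified response header as an attribute, and that returning a POSIXct time satisfies vizlab's fetchTimestamp contract):

# sketch only: use the S3 object's Last-Modified header as the fetch timestamp
# instead of vizlab::alwaysCurrent
fetchTimestamp.s3_object <- function(viz){
  args <- viz[["fetch_args"]]
  resp <- aws.s3::head_object(object = args[["object_name"]],
                              bucket = args[["bucket_name"]])
  # assumes response headers come back as attributes; parse the HTTP date
  as.POSIXct(strptime(attr(resp, "last-modified"),
                      format = "%a, %d %b %Y %H:%M:%S", tz = "GMT"))
}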


fetch.s3_object <- function(viz){

  args <- viz[["fetch_args"]]

  aws.signature::use_credentials(profile='default', file=aws.signature::default_credentials_file())

  # download object from an s3 bucket
  object_fn <- aws.s3::save_object(object = args[["object_name"]],
                                   bucket = args[["bucket_name"]],
                                   file = viz[["location"]])
  return(object_fn)
}
scripts/preprocess/clean_county_boundaries.R
@@ -1,9 +1,13 @@
 #' Cleans data for historical county polygons.
-process.county_boundaries <- function(viz){
-  deps <- readDepends(viz)
+clean_county_boundaries <- function(location, shp_zip_fn){
 
+  library(sf)
+  library(dplyr)
+  library(geojsonio)
+  library(jsonlite)
+
   # unzip the shapefiles, which are zip files within a zip file
-  map_zip <- deps$county_boundaries_zip
+  map_zip <- shp_zip_fn
   map_dir <- file.path(tempdir(), 'county_boundaries')
   unzip(map_zip, exdir=map_dir)
   map_shp_zips <- dir(dir(map_dir, full.names=TRUE), full.names=TRUE)
@@ -38,7 +42,7 @@ process.county_boundaries <- function(viz){
   counties <- consolidate_county_info(all_shps_simple)
 
   # split the country-wide shapefiles into state-wide shapefiles
-  split_shps <- lapply(setNames(nm=states$state_FIPS[c(1,4)]), function(state_fips) {
+  split_shps <- lapply(setNames(nm=states$state_FIPS), function(state_fips) {
     message('splitting out shapefiles for state ', state_fips)
 
     # subset to just one state
@@ -73,7 +77,7 @@ process.county_boundaries <- function(viz){
   # save to one big zip file
   oldwd <- setwd(geojsondir)
   on.exit(setwd(oldwd))
-  zipfile <- file.path(oldwd, viz[['location']])
+  zipfile <- file.path(oldwd, location)
   if(file.exists(zipfile)) file.remove(zipfile)
   filestozip <- dir()
   zip(zipfile, files=filestozip)
47 changes: 47 additions & 0 deletions scripts/preprocess/execute_preprocessing.R
@@ -0,0 +1,47 @@
# This file describes how to run `preprocess.yaml`, which is the yaml that
# orchestrates the steps required for preprocessing the county boundary and
# state/county fips data. This should not need to be executed by every
# contributor because the results are stored in the S3 bucket. Most should
# just worry about the viz.yaml.

# This workflow assumes that you have the required R packages and appropriate
# credentials (with the profile as "default") stored in:
aws.signature::default_credentials_file()

# required for topo_county_boundaries.sh
# install node.js https://nodejs.org/en/, then run
# npm install -g topojson

# required R packages:
#
# aws.s3:
# repo: CRAN
# version: 0.3.3
# aws.signature:
# repo: CRAN
# version: 0.3.5
# dplyr:
# repo: CRAN
# version: 0.7.4
# geojsonio:
# repo: CRAN
# version: 0.5.0
# jsonlite:
# repo: CRAN
# version: 1.5
# remake:
# repo: github
# version: 0.3.0
# name: richfitz/remake
# sf:
# repo: CRAN
# version: 0.6.0

# run the full preprocessing workflow
# this will take ~30 minutes; the longest step is fetching the data from s3
remake::make(target_names = "preprocess",
remake_file = "preprocess.yaml")

# run an individual target:
remake::make(target_names = "cache/county_boundaries_topojson.zip",
remake_file = "preprocess.yaml")
6 changes: 6 additions & 0 deletions scripts/preprocess/execute_shell_script.R
@@ -0,0 +1,6 @@
execute_shell_script <- function(location, zipfilepath, shell_script_fn, statecsvpath){

  cmd <- paste("bash", shell_script_fn, zipfilepath, statecsvpath, location)
  system(cmd)

}
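For the cache/county_boundaries_topojson.zip target in preprocess.yaml, the assembled command is effectively "bash scripts/preprocess/topo_county_boundaries.sh cache/county_boundaries_geojson.zip cache/state_fips.csv cache/county_boundaries_topojson.zip" (argument order: input geojson zip, state fips csv, output zip). A variant that fails loudly on a non-zero exit status could look like this sketch (not part of the PR):

# sketch: pass arguments separately and stop if the shell script fails
execute_shell_script <- function(location, zipfilepath, shell_script_fn, statecsvpath){
  status <- system2("bash", args = c(shell_script_fn, zipfilepath, statecsvpath, location))
  if (status != 0) stop(shell_script_fn, " exited with status ", status)
}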
10 changes: 10 additions & 0 deletions scripts/preprocess/fetch_s3_object.R
@@ -0,0 +1,10 @@
fetch_s3_object <- function(location, obj_name, bucket_name){

  aws.signature::use_credentials(profile='default', file=aws.signature::default_credentials_file())

  # download object from an s3 bucket
  object_fn <- aws.s3::save_object(object = obj_name,
                                   bucket = bucket_name,
                                   file = location)
  return(object_fn)
}
10 changes: 10 additions & 0 deletions scripts/preprocess/push_s3_object.R
@@ -0,0 +1,10 @@
push_s3_object <- function(s3_fn, existing_fn, bucket_name) {

  aws.signature::use_credentials(profile='default', file=aws.signature::default_credentials_file())

  s3_push <- aws.s3::put_object(file = existing_fn,
                                object = s3_fn,
                                bucket = bucket_name)

  return(s3_push)
}
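To spot-check that the upload landed, something along these lines could be run afterwards (a suggestion, not part of this PR; assumes the same 'default' credentials profile is loaded):

# returns TRUE (with the response headers as attributes) if the object exists in the bucket
aws.s3::head_object("county_boundaries_topojson.zip", bucket = "viz-water-use-15")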
13 changes: 13 additions & 0 deletions scripts/preprocess/save_state_fips.R
@@ -0,0 +1,13 @@
save_state_fips <- function(location, zipfilepath, jsonfilepath) {

  # get states.json into cache/
  dir_name <- dirname(zipfilepath)
  unzip(zipfilepath, files = jsonfilepath, exdir = dir_name)

  # read json and create vector of just fips
  states_info <- jsonlite::fromJSON(file.path(dir_name, jsonfilepath))
  fips <- states_info[["state_FIPS"]]

  # write.csv ignores col.names, so use write.table to drop the header row
  write.table(fips, location, sep = ",", col.names = FALSE, row.names = FALSE, quote = FALSE)
}
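Since the csv is written without a header row, downstream R code would read it back with something like the following (illustrative only, not part of this PR):

# header = FALSE because save_state_fips() drops col.names;
# colClasses keeps any leading zeros in the FIPS codes
fips <- read.csv("cache/state_fips.csv", header = FALSE, colClasses = "character")[[1]]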
45 changes: 45 additions & 0 deletions scripts/preprocess/topo_county_boundaries.sh
@@ -0,0 +1,45 @@
#!/bin/bash

# create a temp directory
TMP=$(mktemp -d)

# unzip the geojson
unzip $1 -d $TMP

# pick out the geojson files (exclude counties.js and states.js)
GJ=$(dir $TMP/*.geojson)

# list state fips for now
while read fip
Owner: neat! good job getting this bash loop figured out.
Author: the real mvp: @wdwatkins
Owner: 💯
do

  fipfixed=$(echo "$fip" | tr -d '\r')
  path="$TMP"/"$fipfixed"

  # convert to topojson
  geo2topo \
    state=$path.geojson \
    -o $path.json

  # simplify
  toposimplify -s 1e-4 -f \
    $path.json \
    -o $path-simple.json

  # quantize (store as integers, scale later)
  topoquantize 1e5 \
    $path-simple.json \
    -o $path-quantized.json

  echo "Finished $fipfixed"

done < $2

echo All done

# zip back up for storage in cache/
WD=$(pwd)
cd "$TMP"
zip "$WD/$3" ./*quantized.json states.json counties.json
cd "$WD"

41 changes: 0 additions & 41 deletions scripts/process/topo_county_boundaries.sh

This file was deleted.

40 changes: 40 additions & 0 deletions scripts/publish/multiple_json_files.R
@@ -0,0 +1,40 @@
publish.multiple_json_files <- function(viz) {
  deps <- readDepends(viz)
  args <- viz[["publish_args"]]
  file_pattern <- args[["pattern"]]

  # unzip if it's a zip file
  if(grepl(".zip", deps[["files_location"]])) {
    # unzip and cache in a folder before publishing
    extract_boundary_files(deps[["files_location"]], file_pattern, viz[["location"]])
    paths_to_use <- list.files(viz[["location"]], full.names = TRUE)

  } else {
    # paths are just the files in the passed-in location if they aren't zipped
    paths_to_use <- list.files(deps[["files_location"]], full.names = TRUE)
  }

  for(fp in paths_to_use) {

    # create viz-like item to use in publish
    viz_json <- vizlab::as.viz(list(location = fp, mimetype = "application/json"))

    # use publisher to follow typical json publishing steps to get file to target
    vizlab::publish(viz_json)
  }

}

#' Extract files from a zipfile
#'
#' @param zipfile the name of the .zip file
#' @param pattern pattern matched against filenames with grep to decide
#'   which files to extract
#' @param exdir where to extract the zipfiles
extract_boundary_files <- function(zipfile, pattern, exdir) {

  allfiles <- unzip(zipfile=zipfile, list=TRUE)[["Name"]]
  boundaryfiles <- allfiles[grep(pattern, allfiles)]

  unzip(zipfile=zipfile, files=boundaryfiles, exdir=exdir, overwrite=TRUE)
}
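A hypothetical standalone call to the helper, handy for eyeballing what the publisher would extract (the paths and the "quantized" pattern are illustrative, based on what topo_county_boundaries.sh zips up; not part of the PR):

extract_boundary_files(zipfile = "cache/county_boundaries_topojson.zip",
                       pattern = "quantized",
                       exdir   = "target/data")
list.files("target/data")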
14 changes: 14 additions & 0 deletions scripts/s3_bucket_setup.R
@@ -0,0 +1,14 @@
# This file only needs to be run one time, by one person, for the whole project.
# Including so it's easier to create buckets again in the future.

library(aws.signature)
message('check that credentials for dev profile at ', aws.signature::default_credentials_file(), " match those in get_dssecret('dev-owi-s3-access')")
aws.signature::use_credentials(profile='dev', file=aws.signature::default_credentials_file())

library(aws.s3)
bucketlist() # to see which buckets are already there
new_bucket_name <- 'viz-water-use-15' # convention: 'viz-' followed by the github repo name for the vizzy
put_bucket(new_bucket_name, region='us-west-2', acl='private') # gives error if bucket already exists

# this command posted the data (took 1.5 hrs)
put_object(file='data/nhgis0002_shape.zip', object='IPUMS_NHGIS_counties.zip', bucket='viz-water-use-15')