diff --git a/Dockerfile b/Dockerfile
index e8681137..6a7bcb64 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,7 +16,7 @@ RUN npm install
 # copy code from local checkout
 ADD . ${WORKDIR}
 
-ENV WOF_DIR '/data/whosonfirst/data'
+ENV WOF_DIR '/data/whosonfirst/sqlite'
 ENV PLACEHOLDER_DATA '/data/placeholder'
 
 USER pelias
diff --git a/cmd/download_extract.sh b/cmd/download_extract.sh
deleted file mode 100755
index 832e435d..00000000
--- a/cmd/download_extract.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-# directory of this file
-DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
-
-# placetypes to download and extract
-PLACETYPES=( 'neighbourhood' 'macrohood' 'borough' 'locality' 'localadmin' 'county' 'macrocounty' 'region'
-  'macroregion' 'disputed' 'dependency' 'country' 'empire' 'marinearea' 'continent' 'ocean' )
-
-# download and extract fields from contents of tar
-function extract {
-  curl -so "/tmp/wof-${1}-latest-bundle.tar.bz2" "https://whosonfirst.mapzen.com/bundles/wof-${1}-latest-bundle.tar.bz2"
-  if hash lbzip2 2>/dev/null; then
-    tar --wildcards '*.geojson' -x --use-compress-program=lbzip2 --to-command 'jq -cMf "${DIR}/jq.filter"' -f "/tmp/wof-${1}-latest-bundle.tar.bz2"
-  else
-    tar --wildcards '*.geojson' -jx --to-command 'jq -cMf "${DIR}/jq.filter"' -f "/tmp/wof-${1}-latest-bundle.tar.bz2"
-  fi
-  rc=$?; if [[ $rc != 0 ]]; then
-    >&2 echo "/tmp/wof-${1}-latest-bundle.tar.bz2"
-    >&2 echo "command exited with status: $rc"
-  fi
-}
-
-# export variables required by the 'extract' function
-export -f extract
-export DIR
-
-# run the import
-parallel \
-  --no-notice \
-  --line-buffer \
-  --jobs -1 \
-  extract \
-  ::: "${PLACETYPES[@]}"
diff --git a/cmd/extract.sh b/cmd/extract.sh
index 2816b56d..7c8ebade 100755
--- a/cmd/extract.sh
+++ b/cmd/extract.sh
@@ -8,6 +8,6 @@ mkdir -p ${PLACEHOLDER_DATA};
 
 echo "Creating extract at ${PLACEHOLDER_DATA}/wof.extract"
 
-${DIR}/wof_extract.sh > ${PLACEHOLDER_DATA}/wof.extract;
+node --max_old_space_size=8000 ${DIR}/wof_extract_sqlite.js > ${PLACEHOLDER_DATA}/wof.extract;
 
 echo 'Done!'
diff --git a/cmd/wof_extract.sh b/cmd/wof_extract.sh
deleted file mode 100755
index 99ce35de..00000000
--- a/cmd/wof_extract.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-# directory of this file
-DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd );
-
-# location of whosonfirst data dir
-# note: set WOF_DIR env var to override
-WOF_DIR=${WOF_DIR:-'/data/whosonfirst-data/data'};
-
-# requires command: jq - Command-line JSON processor
-# on ubuntu: sudo apt-get install jq
-
-# requires version jq 1.5 or later, for older versions of ubunutu:
-# sudo apt-get remove jq
-# sudo apt-get install libonig2
-# wget http://de.archive.ubuntu.com/ubuntu/pool/universe/j/jq/jq_1.5+dfsg-1_amd64.deb
-# sudo dpkg -i jq_1.5+dfsg-1_amd64.deb
-
-# ensure jq exists and is executable
-JQ_BIN=$(which jq) || true
-if [[ ! -f "${JQ_BIN}" || ! -x "${JQ_BIN}" ]]; then
-  echo "jq binary not found or is not executable" 1>&2;
-  exit 1;
-fi
-
-# parellize execution on systems which support it
-XARGS_CMD='xargs';
-PARALLEL_BIN=$(which parallel) || true
-if [[ -f "${PARALLEL_BIN}" || -x "${PARALLEL_BIN}" ]]; then
-  echo "info: using parallel execution" 1>&2;
-  XARGS_CMD='parallel --no-notice --group --keep-order --jobs +0';
-fi
-
-# filter records by placetype
-# removing any file names from the stream whose body does not match the pattern
-function placetypeFilter {
-  while IFS= read -r FILENAME; do
-    grep --files-with-match -f "${DIR}/placetype.filter" "${FILENAME}" || true;
-  done
-}
-
-# extract only the json properies from each file (eg: excluding zs:*)
-# note: excludes 'alt' geometeries
-find "${WOF_DIR}" -type f -name '*.geojson' |\
-  grep -E '/[0-9]+\.geojson$' |\
-  placetypeFilter |\
-  ${XARGS_CMD} ${JQ_BIN} -c -M -f "${DIR}/jq.filter";
diff --git a/cmd/wof_extract_sqlite.js b/cmd/wof_extract_sqlite.js
new file mode 100644
index 00000000..75689e53
--- /dev/null
+++ b/cmd/wof_extract_sqlite.js
@@ -0,0 +1,91 @@
+const path = require('path');
+const fs = require('fs');
+const whosonfirst = require('pelias-whosonfirst');
+const SQLiteStream = whosonfirst.SQLiteStream;
+const through = require('through2');
+const Placeholder = require('../Placeholder');
+const combinedStream = require('combined-stream');
+
+// file names accepted as whosonfirst sqlite databases
+const SQLITE_REGEX = /whosonfirst-data-[a-z0-9-]+\.db$/;
+
+// directory scanned for sqlite databases
+// note: set WOF_DIR env var to override
+const WOF_DIR = process.env.WOF_DIR || '/data/whosonfirst-data/sqlite';
+
+// print a readable message and abort with a non-zero exit status
+const fatal = message => {
+  console.error(message);
+  process.exit(1);
+};
+
+// placetypes to extract, parsed from 'placetype.filter'
+const layers = fs.readFileSync(path.join(__dirname, 'placetype.filter'), 'utf-8')
+  .replace(/^.*\(/, '') // Removes all characters before the first parenthesis
+  .match(/[a-z]+/g) || []; // Get the layer list
+
+// property name patterns to keep, parsed from the test("...") calls in 'jq.filter'
+// note: each quoted pattern is matched on its own (escaped characters included),
+// so a line holding several test() calls yields one RegExp per call
+const jq_filter = (fs.readFileSync(path.join(__dirname, 'jq.filter'), 'utf-8')
+  .match(/test\("(?:[^"\\]|\\.)*"\)/g) || []) // Get all tests
+  .map(s => s.slice(6, -2)) // Get only regex part: strip the leading test(" and trailing ")
+  .map(s => new RegExp(s)); // Transform it into JS RegExp
+
+// a missing or unparsable filter file would otherwise crash or produce a useless extract
+if (!layers.length) { fatal('no placetypes found in placetype.filter'); }
+if (!jq_filter.length) { fatal('no property patterns found in jq.filter'); }
+
+// sqlite databases to read
+// note: readdir order is not guaranteed, sort to keep the extract order reproducible
+const databases = fs.readdirSync(WOF_DIR)
+  .filter(file => SQLITE_REGEX.test(file))
+  .sort()
+  .map(file => path.join(WOF_DIR, file));
+
+// fail before any output is produced (or Placeholder is loaded with reset: true in 'build' mode)
+if (!databases.length) { fatal(`no whosonfirst sqlite databases found in ${WOF_DIR}`); }
+
+// select the sink: insert records through Placeholder when invoked with the
+// 'build' argument, otherwise print one JSON document per line to stdout
+const output = () => {
+  if (process.argv[2] === 'build') {
+    const ph = new Placeholder();
+    ph.load({ reset: true });
+    return through.obj((row, _, next) => {
+      ph.insertWofRecord(row, next);
+    }, done => {
+      console.error('populate fts...');
+      ph.populate();
+      console.error('optimize...');
+      ph.optimize();
+      console.error('close...');
+      ph.close();
+      done();
+    });
+  } else {
+    return through.obj((row, _, next) => {
+      console.log(JSON.stringify(row));
+      next();
+    });
+  }
+};
+
+// chain one SQLiteStream per database into a single combined stream
+const sqliteStream = combinedStream.create();
+databases.forEach(dbPath => {
+  sqliteStream.append(next => {
+    next(new SQLiteStream(dbPath, SQLiteStream.findGeoJSONByPlacetype(layers)));
+  });
+});
+
+sqliteStream
+  .pipe(whosonfirst.toJSONStream())
+  .pipe(through.obj((row, _, next) => {
+    // drop every property whose name matches none of the jq.filter patterns
+    Object.keys(row.properties)
+      .filter(key => !jq_filter.some(regex => regex.test(key)))
+      .forEach(key => delete row.properties[key]);
+    next(null, row.properties);
+  }))
+  .pipe(output());
diff --git a/package.json b/package.json
index a0a19a58..dc24dd13 100644
--- a/package.json
+++ b/package.json
@@ -40,7 +40,9 @@
     "lower-case": "^2.0.0",
     "morgan": "^1.9.0",
     "pelias-blacklist-stream": "^1.1.0",
+    "pelias-config": "^4.5.0",
     "pelias-logger": "^1.2.1",
+    "pelias-whosonfirst": "^4.0.0",
     "remove-accents": "^0.4.0",
     "require-dir": "^1.0.0",
     "sorted-intersect": "^0.1.4",