Skip to content

Commit

Permalink
powerbi: Add cronjob
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Sep 18, 2024
1 parent 317ba95 commit 79f4068
Show file tree
Hide file tree
Showing 8 changed files with 125 additions and 8 deletions.
6 changes: 6 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Dependabot configuration: keep the versions pinned in GitHub Actions
# workflows (e.g. actions/checkout@v4) up to date.
version: 2
updates:
  - package-ecosystem: "github-actions"
    # "/" covers workflow files under .github/workflows at the repository root.
    directory: "/"
    schedule:
      interval: "daily"
14 changes: 14 additions & 0 deletions .github/workflows/shell.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Lint every shell script in the repository with checkbashisms, ShellCheck and shfmt.
name: Lint Shell
on: [push, pull_request]
jobs:
  build:
    # Avoid duplicate runs: for pull requests from branches in this same
    # repository, the push event already triggered a run.
    if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: |
          sudo apt update
          sudo apt install devscripts shellcheck shfmt
      # `shfmt -f .` lists the shell files to check, so scripts without a .sh
      # extension are covered too.
      - run: checkbashisms $(shfmt -f .)
      - run: shellcheck $(shfmt -f .)
      # -d: print diffs instead of rewriting; -i 4: 4-space indent; -sr: space
      # after redirect operators.
      - run: shfmt -d -i 4 -sr $(shfmt -f .)
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/powerbi/kingfisher-collect
/powerbi/cardinal-rs
/powerbi/ecuador_sercop_bulk.ini
/powerbi/data
/powerbi/logs
/powerbi/tmp
/powerbi/ecuador_sercop_bulk.ini
/powerbi/scratch
4 changes: 3 additions & 1 deletion powerbi/Dockerfile_python
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ FROM python:3.10-alpine

COPY kingfisher-collect/requirements.txt /tmp/requirements.txt

RUN apk add --no-cache --virtual .build-deps build-base libffi-dev postgresql-dev && \
RUN apk add --no-cache --virtual .build-deps build-base libffi-dev libpq-dev && \
pip install --no-cache-dir -r /tmp/requirements.txt && \
apk --purge del .build-deps

RUN apk add libpq

COPY kingfisher-collect/kingfisher_scrapy kingfisher_scrapy
COPY kingfisher-collect/scrapy.cfg scrapy.cfg
COPY cardinal-rs/manage.py manage.py
22 changes: 17 additions & 5 deletions powerbi/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -111,17 +111,29 @@ data:
logs:
mkdir -p logs

tmp:
mkdir -p tmp
scratch:
mkdir -p scratch

ecuador_sercop_bulk.ini:
curl -sSLO https://raw.githubusercontent.com/open-contracting/deploy/main/salt/kingfisher/collect/files/cardinal/ecuador_sercop_bulk.ini

filesystem: data logs tmp ecuador_sercop_bulk.ini
filesystem: data logs scratch ecuador_sercop_bulk.ini

.PHONY: print-crontab
print-crontab:
printf "15 0 * * * CARDINAL_DBNAME=$(DATABASE_NAME) CARDINAL_DBUSER=$(DATABASE_USER) CARDINAL_DBHOST=$(DATABASE_HOST) $(CARDINAL_WORKDIR)/cron.sh"
printf "15 0 * * * CARDINAL_DBNAME=$(DATABASE_NAME) CARDINAL_DBUSER=$(DATABASE_USER) CARDINAL_DBHOST=$(DATABASE_HOST) CARDINAL_DBHOST_DOCKER=$(DATABASE_HOST_DOCKER) $(CARDINAL_WORKDIR)/cron.sh"

.PHONY: clean
clean:
rm -rf kingfisher-collect
rm -rf cardinal-rs
rm -f ecuador_sercop_bulk.ini

.PHONY: force-clean
force-clean: clean
rm -rf data
rm -rf logs
rm -rf scratch

.PHONY: all
all: build database filesystem print-crontab
all: build database filesystem
20 changes: 20 additions & 0 deletions powerbi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@ To print the commands that a `make` target would execute, use the `-n` (`--dry-r
make -n database
```

To run all targets (set up the database and filesystem, build the images, and install the crontab), run:

```bash
make -s
make -s print-crontab | crontab
```

## PostgreSQL

Run `make database` as a local user with the [CREATEDB](https://www.postgresql.org/docs/current/sql-createrole.html) privilege (for example, as the `postgres` user):
Expand All @@ -25,6 +32,19 @@ This will:
- Create the `ecuador_sercop_bulk_result` table, owned by the user, if it doesn't exist
- Create (or re-create) the `codelist`, `indicator` and `cpc` tables, owned by the user

## Filesystem

Run `make filesystem` from the working directory for the project:

```bash
make -s filesystem
```

This will:

- Create `data`, `logs` and `scratch` directories
- Download Cardinal's settings file to `ecuador_sercop_bulk.ini`

## Docker

Run `make build` to build two images:
Expand Down
1 change: 1 addition & 0 deletions powerbi/config.mk
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
DATABASE_NAME=cardinal
DATABASE_USER=cardinal
DATABASE_HOST=localhost
# Hostname at which Docker containers reach services running on the host
# (paired with --add-host=host.docker.internal:host-gateway in docker run).
DATABASE_HOST_DOCKER=host.docker.internal
# Absolute path to this project's working directory; used to build the
# crontab entry printed by the print-crontab target.
CARDINAL_WORKDIR=/absolute/path/workdir
62 changes: 62 additions & 0 deletions powerbi/cron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/sh
# Daily pipeline, intended to be run from cron (see the print-crontab target
# in the Makefile): crawl Ecuador SERCOP bulk data with Kingfisher Collect,
# prepare it and calculate indicators with Cardinal, then load the cleaned
# data and the results into PostgreSQL.

set -eu

# Directory containing this script; the crontab invokes it by absolute path,
# so $WORKDIR is absolute under cron.
WORKDIR=$(dirname "$0")
# Connection defaults, overridable from the crontab environment.
: "${CARDINAL_DBNAME:=cardinal}"
: "${CARDINAL_DBUSER:=cardinal}"
: "${CARDINAL_DBHOST:=localhost}"
# Hostname at which containers reach the host's PostgreSQL server.
: "${CARDINAL_DBHOST_DOCKER:=host.docker.internal}"

# Crawl new releases into data/ and the ecuador_sercop_bulk database table.
docker run -v "$WORKDIR:/workdir" --rm --name kingfisher-collect --add-host=host.docker.internal:host-gateway kingfisher-collect \
    scrapy crawl ecuador_sercop_bulk \
    -a crawl_time=2015-01-01T00:00:00 -a from_date=2024-09 \
    -s "FILES_STORE=/workdir/data" \
    -s "DATABASE_URL=postgresql://$CARDINAL_DBUSER@$CARDINAL_DBHOST_DOCKER:5432/$CARDINAL_DBNAME" \
    --logfile="/workdir/logs/ecuador_sercop_bulk-$(date +%F).log"

# Export the crawled JSON data, one document per line (-t: tuples only).
psql -U "$CARDINAL_DBUSER" -h "$CARDINAL_DBHOST" -t \
    -c 'SELECT data FROM ecuador_sercop_bulk' \
    -o "$WORKDIR/scratch/ecuador_sercop_bulk.jsonl"

# Prepare the data for indicator calculation, writing cleaned output to
# .out.jsonl and any per-row problems to .err.csv.
docker run -v "$WORKDIR:/workdir" --rm --name cardinal-rs cardinal-rs \
    prepare \
    -s /workdir/ecuador_sercop_bulk.ini \
    -o /workdir/scratch/ecuador_sercop_bulk.out.jsonl \
    -e /workdir/scratch/ecuador_sercop_bulk.err.csv \
    /workdir/scratch/ecuador_sercop_bulk.jsonl

# Stop if prepare reported any errors (-s: file exists and is non-empty).
if [ -s "$WORKDIR/scratch/ecuador_sercop_bulk.err.csv" ]; then
    echo "$WORKDIR/scratch/ecuador_sercop_bulk.err.csv contains new errors"
    exit 1
fi

# Calculate indicators from the prepared data.
docker run -v "$WORKDIR:/workdir" --rm --name cardinal-rs cardinal-rs \
    indicators \
    -s /workdir/ecuador_sercop_bulk.ini \
    --map \
    /workdir/scratch/ecuador_sercop_bulk.out.jsonl \
    > "$WORKDIR/scratch/ecuador_sercop_bulk.json"

# This appends to the CSV file, to keep flags consistent over time. Delete it manually if results are incorrect.
docker run -v "$WORKDIR:/workdir" --rm --name kingfisher-collect kingfisher-collect \
    python manage.py json-to-csv \
    -q /workdir/scratch/ecuador_sercop_bulk.json \
    /workdir/scratch/ecuador_sercop_bulk.csv

# Reload the cleaned data. The copy uses unlikely quote/delimiter bytes so
# each JSON line is loaded verbatim into a single jsonb column.
psql -U "$CARDINAL_DBUSER" -h "$CARDINAL_DBHOST" -q \
    -c "BEGIN" \
    -c "DROP TABLE IF EXISTS ecuador_sercop_bulk_clean" \
    -c "CREATE TABLE ecuador_sercop_bulk_clean (data jsonb)" \
    -c "\copy ecuador_sercop_bulk_clean (data) from stdin csv quote e'\x01' delimiter e'\x02'" \
    -c "CREATE INDEX idx_ecuador_sercop_bulk_clean ON ecuador_sercop_bulk_clean (cast(data->>'date' as text))" \
    -c "END" \
    < "$WORKDIR/scratch/ecuador_sercop_bulk.out.jsonl"

# Reload the indicator results from the (cumulative) CSV file.
psql -U "$CARDINAL_DBUSER" -h "$CARDINAL_DBHOST" -q \
    -c "BEGIN" \
    -c "DROP TABLE IF EXISTS ecuador_sercop_bulk_result" \
    -c "CREATE TABLE IF NOT EXISTS ecuador_sercop_bulk_result (id serial PRIMARY KEY, ocid text, subject text, code text, result numeric, buyer_id text, procuring_entity_id text, tenderer_id text, created_at timestamp without time zone)" \
    -c "\copy ecuador_sercop_bulk_result (ocid, subject, code, result, buyer_id, procuring_entity_id, tenderer_id, created_at) from stdin csv header" \
    -c "END" \
    < "$WORKDIR/scratch/ecuador_sercop_bulk.csv"

0 comments on commit 79f4068

Please sign in to comment.