Skip to content

Commit

Permalink
powerbi: Set password for project database user. Cleanup temporary fi…
Browse files Browse the repository at this point in the history
…les. Fix Docker networking.

Makefile:
- Add env.list target to download env.list
- Update setup target to run env.list target and set file permissions
- Update createuser target to set password

cron.sh:
- Use DATABASE_HOST to add host to container
- Use DATABASE_PASSWORD to connect to database
- Read config.mk and env.list into kingfisher-collect container
- Remove temporary files if CARDINAL_DEBUG unset

config.mk:
- Add DATABASE_PORT and use in Makefile and cron.sh to connect to database
- Update MAINTENANCE_DATABASE_USER to default "postgres"
- Remove DATABASE_HOST_DOCKER

env.list:
- Add DATABASE_PASSWORD
- Add CARDINAL_DEBUG

docs:
- Document network access for Docker container
  • Loading branch information
jpmckinney committed Sep 20, 2024
1 parent 3786258 commit e130acd
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 33 deletions.
27 changes: 18 additions & 9 deletions powerbi/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,21 @@ FORCE:

include config.mk

# Connection URIs passed to psql, assembled from the settings in config.mk.
# Neither URI contains a password.
# URI for the project database, as the project database user.
DATABASE_URL=postgresql://$(DATABASE_USER)@$(DATABASE_HOST):$(DATABASE_PORT)/$(DATABASE_NAME)
# URI for the maintenance database, as the maintenance database user.
MAINTENANCE_DATABASE_URL=postgresql://$(MAINTENANCE_DATABASE_USER)@$(DATABASE_HOST):$(DATABASE_PORT)/$(MAINTENANCE_DATABASE_NAME)

config.mk:
curl -sSLO https://raw.githubusercontent.com/open-contracting/bi.open-contracting.org/refs/heads/main/powerbi/config.mk

cron.sh:
curl -sSLO https://github.com/open-contracting/bi.open-contracting.org/raw/refs/heads/main/powerbi/cron.sh

env.list:
curl -sSLO https://github.com/open-contracting/bi.open-contracting.org/raw/refs/heads/main/powerbi/env.list

.PHONY: setup
setup: config.mk cron.sh
setup: config.mk cron.sh env.list
chmod go-rwx env.list

kingfisher-collect:
git clone https://github.com/open-contracting/kingfisher-collect.git
Expand Down Expand Up @@ -53,41 +60,43 @@ build: build-python build-cardinal

.PHONY: createdb
createdb:
psql $(MAINTENANCE_DATABASE_NAME) -U $(MAINTENANCE_DATABASE_USER) -h $(DATABASE_HOST) \
psql $(MAINTENANCE_DATABASE_URL) \
<<<"SELECT 'CREATE DATABASE $(DATABASE_NAME)' WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = '$(DATABASE_NAME)')\gexec"

.PHONY: createuser
createuser:
psql $(MAINTENANCE_DATABASE_NAME) -U $(MAINTENANCE_DATABASE_USER) -h $(DATABASE_HOST) \
<<<"SELECT 'CREATE USER $(DATABASE_USER)' WHERE NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '$(DATABASE_USER)')\gexec"
# Create the project database user, unless a role with that name exists.
# The query divides by the number of matching roles, so psql fails (division
# by zero) when the role is missing, which selects the createuser branch.
# createuser is given the same host, port and maintenance user as psql, so
# that the role is created on the server that was just checked rather than on
# the default local server. -P prompts for the new role's password.
psql "$(MAINTENANCE_DATABASE_URL)" \
-c "SELECT 1/(SELECT COUNT(*) FROM pg_catalog.pg_roles WHERE rolname = '$(DATABASE_USER)')" >/dev/null 2>&1 \
&& echo "role '$(DATABASE_USER)' already exists" \
|| createuser -h "$(DATABASE_HOST)" -p "$(DATABASE_PORT)" -U "$(MAINTENANCE_DATABASE_USER)" -P "$(DATABASE_USER)"

.PHONY: codelist
codelist:
curl -sSL https://raw.githubusercontent.com/open-contracting/deploy/main/salt/kingfisher/collect/files/data/codelist.csv | \
psql $(DATABASE_NAME) -U $(DATABASE_USER) -h $(DATABASE_HOST) -v ON_ERROR_STOP=1 \
psql "$(DATABASE_URL)" -v ON_ERROR_STOP=1 \
-c 'DROP TABLE IF EXISTS codelist' \
-c 'CREATE TABLE codelist (id serial PRIMARY KEY, codelist text, code text, code_es text, UNIQUE (codelist, code))' \
-c '\copy codelist (codelist, code, code_es) from stdin csv header'

.PHONY: cpc
cpc:
curl -sSL https://raw.githubusercontent.com/open-contracting/deploy/main/salt/kingfisher/collect/files/data/cpc.csv | \
psql $(DATABASE_NAME) -U $(DATABASE_USER) -h $(DATABASE_HOST) -v ON_ERROR_STOP=1 \
psql "$(DATABASE_URL)" -v ON_ERROR_STOP=1 \
-c 'DROP TABLE IF EXISTS cpc' \
-c 'CREATE TABLE cpc (id serial PRIMARY KEY, code text UNIQUE, description text, description_es text)' \
-c '\copy cpc (code, description, description_es) from stdin csv header'

.PHONY: indicator
indicator:
curl -sSL https://raw.githubusercontent.com/open-contracting/deploy/main/salt/kingfisher/collect/files/data/indicator.csv | \
psql $(DATABASE_NAME) -U $(DATABASE_USER) -h $(DATABASE_HOST) -v ON_ERROR_STOP=1 \
psql "$(DATABASE_URL)" -v ON_ERROR_STOP=1 \
-c 'DROP TABLE IF EXISTS indicator' \
-c 'CREATE TABLE indicator (id serial PRIMARY KEY, code text UNIQUE, category text, title text, description text, category_es text, title_es text, description_es text)' \
-c '\copy indicator (code, category, title, description, category_es, title_es, description_es) from stdin csv header'

.PHONY: ecuador_sercop_bulk_result
ecuador_sercop_bulk_result:
psql $(DATABASE_NAME) -U $(DATABASE_USER) -h $(DATABASE_HOST) \
psql "$(DATABASE_URL)" \
-c 'CREATE TABLE IF NOT EXISTS ecuador_sercop_bulk_result (id serial PRIMARY KEY, ocid text, subject text, code text, result numeric, buyer_id text, procuring_entity_id text, tenderer_id text, created_at timestamp without time zone)'

.PHONY: tables
Expand All @@ -109,7 +118,7 @@ filesystem: data logs scratch ecuador_sercop_bulk.ini

.PHONY: print-crontab
print-crontab: cron.sh
printf "15 0 * * * CARDINAL_DBNAME=$(DATABASE_NAME) CARDINAL_DBUSER=$(DATABASE_USER) CARDINAL_DBHOST=$(DATABASE_HOST) CARDINAL_DBHOST_DOCKER=$(DATABASE_HOST_DOCKER) $(CARDINAL_WORKDIR)/cron.sh\n"
# Print the crontab entry for the nightly run.
# cron.sh runs under "set -u" and reads the DATABASE_* settings, which cron
# does not provide. The entry therefore loads config.mk and env.list first,
# exporting them via "set -a", so the password stays in env.list and is not
# written into the crontab. Changing into the working directory lets the
# script's relative --env-file paths resolve.
printf "15 0 * * * cd $(CARDINAL_WORKDIR) && set -a && . ./config.mk && . ./env.list && $(CARDINAL_WORKDIR)/cron.sh\n"

This comment has been minimized.

Copy link
@yolile

yolile Sep 23, 2024

Member

Without the variables set as part of the command this is now complaining that "DATABASE_USER: parameter not set"


.PHONY: clean-build
clean-build:
Expand Down
32 changes: 23 additions & 9 deletions powerbi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,21 +35,21 @@ Download the [`Makefile`](Makefile) to the current directory:
curl -sSLO https://raw.githubusercontent.com/open-contracting/bi.open-contracting.org/refs/heads/main/powerbi/Makefile
```

Download the [`config.mk`](config.mk) and [`cron.sh`](cron.sh) files to the current directory, if they don't exist:
Download the [`config.mk`](config.mk), [`cron.sh`](cron.sh) and [`env.list`](env.list) files to the current directory, if they don't exist, and restrict permissions on the `env.list` file (`chmod go-rwx`):

```bash
make setup
```

Lastly, edit the `config.mk` file, as needed.
Lastly, edit the `config.mk` and `env.list` files. At minimum, set the `DATABASE_PASSWORD` setting in the `env.list` file to a [strong password](https://www.lastpass.com/features/password-generator).

## Database
## Database (PostgreSQL)

These commands connect to the PostgreSQL host set by the `DATABASE_HOST` setting, by default `localhost`.
These commands connect to the database server set by the `DATABASE_HOST` setting, by default `localhost`, on the port set by the `DATABASE_PORT` setting, by default `5432`.

### Create database and user

This step requires a PostgreSQL **maintenance database user** (`MAINTENANCE_DATABASE_USER` setting, by default the name of the current operating system user) with the privileges:
This step requires a **maintenance database user** (`MAINTENANCE_DATABASE_USER` setting, by default `postgres`) with the privileges:

- [`CREATEDB`](https://www.postgresql.org/docs/current/sql-createrole.html) database privilege
- `CREATEROLE` database privilege
Expand All @@ -60,10 +60,12 @@ Run `make -s createdb createuser` to:
- Create the **project database** (`DATABASE_NAME` setting, by default `cardinal`), owned by the **maintenance database user**, if it doesn't exist
- Create the **project database user** (`DATABASE_USER` setting, by default `cardinal`), if it doesn't exist

It will prompt for the project database user's password. Enter the same password as the `DATABASE_PASSWORD` setting.

This must be run:

- by any operating system user,
- from any directory containing the `Makefile` and `config.mk` files,
- from any directory in which the user can read the `Makefile` and `config.mk` files,
- to which the operating system user has read and execute permissions.

The simplest option is to run this command as the `postgres` operating system user, which has the necessary privileges.
Expand All @@ -87,7 +89,7 @@ Run `make -s tables` to:
This must be run:

- by any operating system user,
- from any directory containing the `Makefile` and `config.mk` files,
- from any directory in which the user can read the `Makefile` and `config.mk` files,
- to which the operating system user has read and execute permissions.

This command requires you to authenticate as the **project database user**. Either enter the password when prompted, or, to skip the password prompt:
Expand All @@ -111,7 +113,7 @@ Run `make build` to:
This must be run:

- by any operating system user,
- from any directory containing the `Makefile`,
- from any directory in which the user can read the `Makefile`,
- to which the user has read, write and execute permissions.

This command requires you to have write permission to the [Docker daemon's Unix socket](https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user), which is owned by the `root` user and `docker` group. Either run the command with `sudo`, or add the operating system user to the `docker` group.
Expand Down Expand Up @@ -143,6 +145,18 @@ This must be run:

## Cron

The [`cron.sh` script](cron.sh) creates a container from the [`kingfisher-collect` image](#docker). This container needs network access to the database server. If the database server is running on the same machine as the cron job, then the simplest option is to:

- Set `listen_addresses = '*'`, either in the [`postgresql.conf`](https://www.postgresql.org/docs/current/config-setting.html) file or in a configuration file under the `conf.d` directory
- Configure the [`pg_hba.conf`](https://www.postgresql.org/docs/current/auth-g-hba-conf.html) file to allow connections from the [IP addresses](https://docs.docker.com/engine/network/#ip-address-and-hostname) allocated by the Docker daemon. For example:

```none
hostssl all all 0.0.0.0/0 scram-sha-256
hostssl all all ::/0 scram-sha-256
```

This assumes that an external firewall closes the port of the database server to external connections.

Preview the crontab entry, to make sure the directory of the `cron.sh` script is correct (if not, edit the `CARDINAL_WORKDIR` setting):

```bash
Expand All @@ -158,7 +172,7 @@ make -s print-crontab | crontab
This must be run:

- by the operating system user that will run the cron job,
- from any directory containing the `Makefile` and `config.mk` files.
- from any directory in which the user can read the `Makefile`, `config.mk` and `env.list` files.

## Clean

Expand Down
14 changes: 8 additions & 6 deletions powerbi/config.mk
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
# The host of the project's database.
# The project's working directory.
CARDINAL_WORKDIR=/home/cardinal

# The host of the database server.
DATABASE_HOST=localhost
# The port of the database server.
DATABASE_PORT=5432
# The name of the project's database.
DATABASE_NAME=cardinal
# The user that connects to and updates the project's database.
DATABASE_USER=cardinal

# The user to create the DATABASE_NAME and DATABASE_USER.
MAINTENANCE_DATABASE_USER=$(whoami)
MAINTENANCE_DATABASE_USER=postgres
# The database to which the MAINTENANCE_DATABASE_USER connects.
MAINTENANCE_DATABASE_NAME=postgres
# The host of the project's database, from within the Docker container.
DATABASE_HOST_DOCKER=host.docker.internal
# The project's working directory.
CARDINAL_WORKDIR=/home/cardinal
36 changes: 27 additions & 9 deletions powerbi/cron.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,37 @@
set -eu

WORKDIR=$(dirname "$0")
: "${CARDINAL_DBNAME:=cardinal}"
: "${CARDINAL_DBUSER:=cardinal}"
: "${CARDINAL_DBHOST:=localhost}"
: "${CARDINAL_DBHOST_DOCKER:=host.docker.internal}"

docker run -v "$WORKDIR:/workdir" --rm --name kingfisher-collect --add-host=host.docker.internal:host-gateway kingfisher-collect \
# Load the settings from the files next to this script. Without this, "set -u"
# aborts at the first reference to $DATABASE_USER ("parameter not set") when
# the script is started from cron with an empty environment.
# NOTE(review): sourcing applies shell quoting rules, whereas docker's
# --env-file reads values literally. Keep values free of shell metacharacters,
# and percent-encode the password if it contains characters reserved in URIs.
. "$WORKDIR/config.mk"
. "$WORKDIR/env.list"
# Treat a missing CARDINAL_DEBUG setting like an empty one, so that the
# clean-up checks below do not trip over "set -u".
: "${CARDINAL_DEBUG:=}"

DATABASE_URL="postgresql://$DATABASE_USER:$DATABASE_PASSWORD@$DATABASE_HOST:$DATABASE_PORT/$DATABASE_NAME"
# Inside the container, the database server is reached under the name
# "postgres". A loopback address would point at the container itself, so every
# loopback spelling is mapped to Docker's host-gateway.
# NOTE(review): --add-host expects an IP address; a DATABASE_HOST that is a
# non-local hostname may need to be resolved first. Confirm against the
# deployment.
case "$DATABASE_HOST" in
    localhost | 127.0.0.1 | ::1) ADD_HOST=host-gateway ;;
    *) ADD_HOST="$DATABASE_HOST" ;;
esac

# The --env-file paths are anchored to $WORKDIR, so the script does not depend
# on the directory from which cron starts it.
docker run -v "$WORKDIR:/workdir" --rm --name kingfisher-collect \
--add-host=postgres:"$ADD_HOST" \
--env-file "$WORKDIR/config.mk" \
--env-file "$WORKDIR/env.list" \
kingfisher-collect \
scrapy crawl ecuador_sercop_bulk \
-a crawl_time=2015-01-01T00:00:00 \
-s "FILES_STORE=/workdir/data" \
-s "DATABASE_URL=postgresql://$CARDINAL_DBUSER@$CARDINAL_DBHOST_DOCKER:5432/$CARDINAL_DBNAME" \
-s "DATABASE_URL=postgresql://$DATABASE_USER:$DATABASE_PASSWORD@postgres:$DATABASE_PORT/$DATABASE_NAME" \
--logfile="/workdir/logs/ecuador_sercop_bulk-$(date +%F).log"

psql "$CARDINAL_DBNAME" -U "$CARDINAL_DBUSER" -h "$CARDINAL_DBHOST" -t \
psql "$DATABASE_URL" -t \
-c 'SELECT data FROM ecuador_sercop_bulk' \
-o "$WORKDIR/scratch/ecuador_sercop_bulk.jsonl"
if [ -z "$CARDINAL_DEBUG" ]; then
psql "$DATABASE_URL" -q -c 'DROP TABLE ecuador_sercop_bulk'
fi

docker run -v "$WORKDIR:/workdir" --rm --name cardinal-rs cardinal-rs \
prepare \
-s /workdir/ecuador_sercop_bulk.ini \
-o /workdir/scratch/ecuador_sercop_bulk.out.jsonl \
-e /workdir/scratch/ecuador_sercop_bulk.err.csv \
/workdir/scratch/ecuador_sercop_bulk.jsonl
if [ -z "$CARDINAL_DEBUG" ]; then
rm -f "$WORKDIR/scratch/ecuador_sercop_bulk.jsonl"
fi

if [ -s "$WORKDIR/scratch/ecuador_sercop_bulk.err.csv" ]; then
echo "$WORKDIR/scratch/ecuador_sercop_bulk.err.csv contains new errors"
Expand All @@ -43,20 +52,29 @@ docker run -v "$WORKDIR:/workdir" --rm --name kingfisher-collect kingfisher-coll
python manage.py json-to-csv \
-q /workdir/scratch/ecuador_sercop_bulk.json \
/workdir/scratch/ecuador_sercop_bulk.csv
if [ -z "$CARDINAL_DEBUG" ]; then
rm -f "$WORKDIR/scratch/ecuador_sercop_bulk.json"
fi

psql "$CARDINAL_DBNAME" -U "$CARDINAL_DBUSER" -h "$CARDINAL_DBHOST" -q \
psql "$DATABASE_URL" -q \
-c "BEGIN" \
-c "DROP TABLE IF EXISTS ecuador_sercop_bulk_clean" \
-c "CREATE TABLE ecuador_sercop_bulk_clean (data jsonb)" \
-c "\copy ecuador_sercop_bulk_clean (data) from stdin csv quote e'\x01' delimiter e'\x02'" \
-c "CREATE INDEX idx_ecuador_sercop_bulk_clean ON ecuador_sercop_bulk_clean (cast(data->>'date' as text))" \
-c "END" \
< "$WORKDIR/scratch/ecuador_sercop_bulk.out.jsonl"
if [ -z "$CARDINAL_DEBUG" ]; then
rm -f "$WORKDIR/scratch/ecuador_sercop_bulk.out.jsonl"
fi

psql "$CARDINAL_DBNAME" -U "$CARDINAL_DBUSER" -h "$CARDINAL_DBHOST" -q \
psql "$DATABASE_URL" -q \
-c "BEGIN" \
-c "DROP TABLE IF EXISTS ecuador_sercop_bulk_result" \
-c "CREATE TABLE IF NOT EXISTS ecuador_sercop_bulk_result (id serial PRIMARY KEY, ocid text, subject text, code text, result numeric, buyer_id text, procuring_entity_id text, tenderer_id text, created_at timestamp without time zone)" \
-c "\copy ecuador_sercop_bulk_result (ocid, subject, code, result, buyer_id, procuring_entity_id, tenderer_id, created_at) from stdin csv header" \
-c "END" \
< "$WORKDIR/scratch/ecuador_sercop_bulk.csv"
if [ -z "$CARDINAL_DEBUG" ]; then
rm -f "$WORKDIR/scratch/ecuador_sercop_bulk.csv"
fi
5 changes: 5 additions & 0 deletions powerbi/env.list
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Set the password of the project database user.
DATABASE_PASSWORD=

# If non-empty, temporary files aren't deleted. (For debugging.)
CARDINAL_DEBUG=

0 comments on commit e130acd

Please sign in to comment.