Part 3: Load db #72

Open · wants to merge 3 commits into base: 2_back
26 changes: 26 additions & 0 deletions back/backend/load_db/README.md
@@ -0,0 +1,26 @@
# WCA Database Download

Downloading the WCA database uses more memory than an App Engine instance comfortably provides, so we run the import on a Compute Engine VM instead.

## Creating a new VM

You can create a new VM at https://console.cloud.google.com/compute/instancesAdd?project=<your-project>. Most settings can be left at their defaults.

- **Machine configuration**: We're currently using e2-highmem-2.
- **Identity and API access**: Use the Compute Engine default service account.
- **Identity and API access**: Allow full access to all Cloud APIs.
- **Management**: Use the following Startup script:

```sh
apt-get update; apt-get upgrade -y
cd speedcubing-canada/back
git reset --hard HEAD
git pull
chmod +x backend/load_db/startup.sh
chmod +x backend/load_db/load_db.sh
backend/load_db/startup.sh
```
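
For reference, roughly the same instance can be created from the command line. This is only a sketch: the instance name and zone are placeholders (none of these values come from this PR), the machine type and scopes mirror the settings above, and the startup script is assumed to be saved locally as `vm-startup.sh`.

```sh
# Hypothetical CLI equivalent of the console settings above.
gcloud compute instances create <instance-name> \
  --project=<your-project> \
  --zone=<zone> \
  --machine-type=e2-highmem-2 \
  --scopes=cloud-platform \
  --metadata-from-file=startup-script=vm-startup.sh
```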

Next, SSH into the instance and follow the instructions in `vm_setup.sh`.

Finally, switch to the Instance Schedule tab and either create a new schedule or attach the instance to an existing one. The schedule should both start and stop the instance.
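
If you prefer to set the schedule up from the CLI as well, something along these lines should work; the region, times, and names here are placeholders rather than values from this PR.

```sh
# Hypothetical instance schedule that starts and stops the VM once a day.
gcloud compute resource-policies create instance-schedule load-db-schedule \
  --region=<region> \
  --vm-start-schedule="0 4 * * *" \
  --vm-stop-schedule="0 6 * * *" \
  --timezone=UTC
gcloud compute instances add-resource-policies <instance-name> \
  --zone=<zone> \
  --resource-policies=load-db-schedule
```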
22 changes: 22 additions & 0 deletions back/backend/load_db/cleanup.py
@@ -0,0 +1,22 @@
from google.cloud import ndb

client = ndb.Client()

# Delete classes we don't use anymore.
with client.context():
    for clsname in ['AppSettings',
                    'Document',
                    'Schedule',
                    'ScheduleCompetition',
                    'SchedulePerson',
                    'ScheduleRound',
                    'ScheduleStaff',
                    'ScheduleStage',
                    'ScheduleTimeBlock',
                    'WcaExport']:
        # Define a throwaway model class and rename it so that ndb resolves
        # its Datastore kind to the legacy class name, then delete every
        # entity of that kind.
        class MyModel(ndb.Model):
            pass

        MyModel.__name__ = clsname
        ndb.delete_multi(MyModel.query().fetch(keys_only=True))
31 changes: 31 additions & 0 deletions back/backend/load_db/delete_old_exports.py
@@ -0,0 +1,31 @@
import shutil
import os

from absl import app
from absl import flags
from absl import logging
from google.cloud import ndb

from backend.models.wca.export import get_latest_export

FLAGS = flags.FLAGS

flags.DEFINE_string('export_base', '', 'Base directory of exports.')

client = ndb.Client()


def main(argv):
    with client.context():
        latest_export = get_latest_export()
        # Skip regular files and the current latest export, then delete all
        # but the last five remaining export directories in sorted order.
        exports = sorted([f for f in os.listdir(FLAGS.export_base)
                          if not os.path.isfile(os.path.join(FLAGS.export_base, f))
                          and f != latest_export])

        for export in exports[:-5]:
            shutil.rmtree(os.path.join(FLAGS.export_base, export))
            logging.info('Deleted %s', export)


if __name__ == '__main__':
    app.run(main)
10 changes: 10 additions & 0 deletions back/backend/load_db/get_latest_export.py
@@ -0,0 +1,10 @@
from google.cloud import ndb

from backend.models.wca.export import get_latest_export

client = ndb.Client()

with client.context():
    export = get_latest_export()
    if export:
        print(export)
163 changes: 163 additions & 0 deletions back/backend/load_db/load_db.py
@@ -0,0 +1,163 @@
import csv

from absl import app
from absl import flags
from absl import logging
from google.cloud import ndb

from backend.load_db.update_champions import update_champions
from backend.models.user import User
from backend.models.wca.competition import Competition
from backend.models.wca.continent import Continent
from backend.models.wca.country import Country
from backend.models.wca.event import Event
from backend.models.wca.export import set_latest_export
from backend.models.wca.format import Format
from backend.models.wca.person import Person
from backend.models.wca.rank import RankAverage
from backend.models.wca.rank import RankSingle
from backend.models.wca.result import Result
from backend.models.wca.round import RoundType

FLAGS = flags.FLAGS

flags.DEFINE_string('old_export_id', '', 'ID of the old export.')
flags.DEFINE_string('new_export_id', '', 'ID of the new export.')
flags.DEFINE_string('export_base', '', 'Base directory of exports.')


def get_tables():
    return [('Continents', Continent),
            ('Countries', Country),
            ('Events', Event),
            ('Formats', Format),
            ('RoundTypes', RoundType),
            ('Persons', Person),
            ('RanksSingle', RankSingle),
            ('RanksAverage', RankAverage),
            ('Competitions', Competition),
            ('Results', Result),
            ]


# Ideally this would live in person.py, but that would be a circular dependency
# between Person and User.
def get_modifier(table):
    if table == 'Persons':
        id_to_province = {}
        for user in User.query(User.province != None):
            if user.wca_person:
                id_to_province[user.wca_person.id()] = user.province

        def modify(person):
            if person.key.id() in id_to_province:
                person.province = id_to_province[person.key.id()]

        return modify
    return None


def read_table(path, cls, apply_filter):
    filter_fn = lambda row: True
    if apply_filter:
        filter_fn = cls.filter()
    out = {}
    try:
        with open(path) as csvfile:
            reader = csv.DictReader(csvfile, dialect='excel-tab')
            for row in reader:
                if filter_fn(row):
                    fields_to_write = cls.columns_used()
                    if 'id' in row:
                        fields_to_write += ['id']
                    to_write = {}
                    for field in fields_to_write:
                        if field in row:
                            to_write[field] = row[field]
                    out[cls.get_id(row)] = to_write
    except FileNotFoundError:
        # This is fine: the file might just not exist (e.g. on the first run,
        # when there is no old export yet).
        pass
    return out


def write_table(path, rows, cls):
    use_id = False
    with open(path, 'r') as csvfile:
        reader = csv.DictReader(csvfile, dialect='excel-tab')
        use_id = 'id' in reader.fieldnames
    with open(path, 'w') as csvfile:
        fields_to_write = cls.columns_used()
        if use_id:
            fields_to_write += ['id']
        writer = csv.DictWriter(csvfile, dialect='excel-tab', fieldnames=fields_to_write)
        writer.writeheader()
        for row in rows.items():
            writer.writerow({k: v for k, v in row[1].items() if k in fields_to_write})


def process_export(old_export_path, new_export_path):
    client = ndb.Client()
    for table, cls in get_tables():
        logging.info('Processing ' + table)
        table_suffix = '/WCA_export_' + table + '.tsv'
        with client.context():
            old_rows = read_table(old_export_path + table_suffix, cls, False)
            logging.info('Old: %d' % len(old_rows))
            new_rows = read_table(new_export_path + table_suffix, cls, True)
            logging.info('New: %d' % len(new_rows))
            write_table(new_export_path + table_suffix, new_rows, cls)

            objects_to_put = []
            keys_to_delete = []

            # Only rows that are new or changed get written; rows missing from
            # the new export get deleted.
            modifier = get_modifier(table)
            for key in new_rows:
                row = new_rows[key]
                if key in old_rows and old_rows[key] == row:
                    continue
                else:
                    obj = cls(id=key)
                    obj.parse_from_dict(row)
                    if modifier:
                        modifier(obj)
                    objects_to_put += [obj]
            for key, row in old_rows.items():
                if key in new_rows:
                    continue
                else:
                    keys_to_delete += [ndb.Key(cls, key)]

        logging.info('Putting %d objects' % len(objects_to_put))
        while objects_to_put:
            batch_size = 5000
            logging.info('%d left' % len(objects_to_put))
            subslice = objects_to_put[:batch_size]
            objects_to_put = objects_to_put[batch_size:]
            # A fresh context per batch keeps memory use bounded.
            with client.context():
                ndb.put_multi(subslice)

        logging.info('Deleting %d objects' % len(keys_to_delete))
        client = ndb.Client()
        with client.context():
            ndb.delete_multi(keys_to_delete)


def main(argv):
    old_export_path = FLAGS.export_base + FLAGS.old_export_id
    new_export_path = FLAGS.export_base + FLAGS.new_export_id

    logging.info(old_export_path)
    logging.info(new_export_path)

    # A new client context is created for each write here, to avoid a memory leak.
    process_export(old_export_path, new_export_path)

    client = ndb.Client()
    with client.context():
        set_latest_export(FLAGS.new_export_id)
        update_champions()


if __name__ == '__main__':
    app.run(main)
46 changes: 46 additions & 0 deletions back/backend/load_db/load_db.sh
@@ -0,0 +1,46 @@
set -e

export PYTHONPATH=$(pwd)

if [ "$SCC_ENV" != "COMPUTE_ENGINE" ]
then
    echo "Emulating datastore."
    #$(gcloud beta emulators datastore env-init)
fi

echo "Deleting old exports"
python3 backend/load_db/delete_old_exports.py \
    --export_base=exports/
echo "Done deleting old exports"

SAVED_EXPORT=$(python3 backend/load_db/get_latest_export.py)
LATEST_EXPORT=$(curl https://www.worldcubeassociation.org/export/results \
    | grep TSV:.*WCA_export \
    | sed -s 's/.*\(WCA_export[0-9A-Za-z_]*\).tsv.zip.*/\1/')

if [ "$SAVED_EXPORT" == "$LATEST_EXPORT" ]
then
    echo "Already have latest export $LATEST_EXPORT; skipping download."
fi

if [ "$SAVED_EXPORT" != "$LATEST_EXPORT" ]
then
    echo "Downloading $LATEST_EXPORT"
    URL_TO_FETCH="https://www.worldcubeassociation.org/export/results/$LATEST_EXPORT.tsv.zip"
    EXPORT_DIR="exports/$LATEST_EXPORT"
    mkdir -p exports/
    rm -rf ./$EXPORT_DIR
    mkdir $EXPORT_DIR
    ZIP_FILE="$EXPORT_DIR/$LATEST_EXPORT.tsv.zip"

    curl $URL_TO_FETCH > $ZIP_FILE
    unzip $ZIP_FILE -d $EXPORT_DIR
    rm $ZIP_FILE

    python3 backend/load_db/load_db.py \
        --old_export_id="$SAVED_EXPORT" \
        --new_export_id="$LATEST_EXPORT" \
        --export_base=exports/
fi

/usr/sbin/shutdown -h now
10 changes: 10 additions & 0 deletions back/backend/load_db/startup.sh
@@ -0,0 +1,10 @@
# Script run on VM startup.
# This is a wrapper for load_db.sh; most of the logic should be in there.
# This is not recommended for running locally, since it sets the environment
# to COMPUTE_ENGINE.

source env/bin/activate
source /root/.bashrc
pip3 install -r requirements.txt

SCC_ENV=COMPUTE_ENGINE ./backend/load_db/load_db.sh