Part 3: Load db #72

Open · wants to merge 3 commits into base: 2_back
26 changes: 26 additions & 0 deletions back/backend/load_db/README.md
@@ -0,0 +1,26 @@
# WCA Database Download

Downloading the WCA database uses more memory than an App Engine instance comfortably provides, so we run the import on a Compute Engine VM instead.

## Creating a new VM

You can create a new VM at https://console.cloud.google.com/compute/instancesAdd?project=<your-project>. Most settings can be left at their defaults.

- **Machine configuration**: We're currently using e2-highmem-2.
- **Identity and API access**: Use the Compute Engine default service account.
- **Identity and API access**: Allow full access to all Cloud APIs.
- **Management**: Use the following Startup script:

```sh
apt-get update; apt-get upgrade -y
cd speedcubing-canada/back
git reset --hard HEAD
git pull
chmod +x backend/load_db/startup.sh
chmod +x backend/load_db/load_db.sh
backend/load_db/startup.sh
```
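
For reference, roughly the same instance can be created from the command line. This is only a sketch: the instance name and zone are placeholders (none of these values come from this PR), the machine type and scopes mirror the settings above, and the startup script is assumed to be saved locally as `vm-startup.sh`.

```sh
# Hypothetical CLI equivalent of the console settings above.
gcloud compute instances create <instance-name> \
  --project=<your-project> \
  --zone=<zone> \
  --machine-type=e2-highmem-2 \
  --scopes=cloud-platform \
  --metadata-from-file=startup-script=vm-startup.sh
```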

Next, SSH into the instance and follow the instructions in `vm_setup.sh`.

Finally, switch to the Instance Schedule tab and either create a new schedule or attach the instance to an existing one. The schedule should both start and stop the instance.
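
If you prefer to set the schedule up from the CLI as well, something along these lines should work; the region, times, and names here are placeholders rather than values from this PR.

```sh
# Hypothetical instance schedule that starts and stops the VM once a day.
gcloud compute resource-policies create instance-schedule load-db-schedule \
  --region=<region> \
  --vm-start-schedule="0 4 * * *" \
  --vm-stop-schedule="0 6 * * *" \
  --timezone=UTC
gcloud compute instances add-resource-policies <instance-name> \
  --zone=<zone> \
  --resource-policies=load-db-schedule
```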
22 changes: 22 additions & 0 deletions back/backend/load_db/cleanup.py
@@ -0,0 +1,22 @@
from google.cloud import ndb

client = ndb.Client()

# Delete classes we don't use anymore.
with client.context():
    for clsname in ['AppSettings',
                    'Document',
                    'Schedule',
                    'ScheduleCompetition',
                    'SchedulePerson',
                    'ScheduleRound',
                    'ScheduleStaff',
                    'ScheduleStage',
                    'ScheduleTimeBlock',
                    'WcaExport']:
        # Define a throwaway model class and rename it so that ndb resolves
        # its Datastore kind to the legacy class name, then delete every
        # entity of that kind.
        class MyModel(ndb.Model):
            pass

        MyModel.__name__ = clsname
        ndb.delete_multi(MyModel.query().fetch(keys_only=True))
31 changes: 31 additions & 0 deletions back/backend/load_db/delete_old_exports.py
@@ -0,0 +1,31 @@
import shutil
import os

from absl import app
from absl import flags
from absl import logging
from google.cloud import ndb

from backend.models.wca.export import get_latest_export

FLAGS = flags.FLAGS

flags.DEFINE_string('export_base', '', 'Base directory of exports.')

client = ndb.Client()


def main(argv):
    with client.context():
        latest_export = get_latest_export()
        # Skip regular files and the current latest export, then delete all
        # but the last five remaining export directories in sorted order.
        exports = sorted([f for f in os.listdir(FLAGS.export_base)
                          if not os.path.isfile(os.path.join(FLAGS.export_base, f))
                          and f != latest_export])

        for export in exports[:-5]:
            shutil.rmtree(os.path.join(FLAGS.export_base, export))
            logging.info('Deleted %s', export)


if __name__ == '__main__':
    app.run(main)
10 changes: 10 additions & 0 deletions back/backend/load_db/get_latest_export.py
@@ -0,0 +1,10 @@
from google.cloud import ndb

from backend.models.wca.export import get_latest_export

client = ndb.Client()

with client.context():
    export = get_latest_export()
    if export:
        print(export)
163 changes: 163 additions & 0 deletions back/backend/load_db/load_db.py
@@ -0,0 +1,163 @@
import csv

from absl import app
from absl import flags
from absl import logging
from google.cloud import ndb

from backend.load_db.update_champions import update_champions
from backend.models.user import User
from backend.models.wca.competition import Competition
from backend.models.wca.continent import Continent
from backend.models.wca.country import Country
from backend.models.wca.event import Event
from backend.models.wca.export import set_latest_export
from backend.models.wca.format import Format
from backend.models.wca.person import Person
from backend.models.wca.rank import RankAverage
from backend.models.wca.rank import RankSingle
from backend.models.wca.result import Result
from backend.models.wca.round import RoundType

FLAGS = flags.FLAGS

flags.DEFINE_string('old_export_id', '', 'ID of the old export.')
flags.DEFINE_string('new_export_id', '', 'ID of the new export.')
flags.DEFINE_string('export_base', '', 'Base directory of exports.')


def get_tables():
    return [('Continents', Continent),
            ('Countries', Country),
            ('Events', Event),
            ('Formats', Format),
            ('RoundTypes', RoundType),
            ('Persons', Person),
            ('RanksSingle', RankSingle),
            ('RanksAverage', RankAverage),
            ('Competitions', Competition),
            ('Results', Result),
            ]


# Ideally this would live in person.py, but that would be a circular dependency
# between Person and User.
def get_modifier(table):
    if table == 'Persons':
        id_to_province = {}
        for user in User.query(User.province != None):
            if user.wca_person:
                id_to_province[user.wca_person.id()] = user.province

        def modify(person):
            if person.key.id() in id_to_province:
                person.province = id_to_province[person.key.id()]

        return modify
    return None


def read_table(path, cls, apply_filter):
    filter_fn = lambda row: True
    if apply_filter:
        filter_fn = cls.filter()
    out = {}
    try:
        with open(path) as csvfile:
            reader = csv.DictReader(csvfile, dialect='excel-tab')
            for row in reader:
                if filter_fn(row):
                    fields_to_write = cls.columns_used()
                    if 'id' in row:
                        fields_to_write += ['id']
                    to_write = {}
                    for field in fields_to_write:
                        if field in row:
                            to_write[field] = row[field]
                    out[cls.get_id(row)] = to_write
    except FileNotFoundError:
        # This is fine: the file might just not exist (e.g. on the first run,
        # when there is no old export yet).
        pass
    return out


def write_table(path, rows, cls):
    use_id = False
    with open(path, 'r') as csvfile:
        reader = csv.DictReader(csvfile, dialect='excel-tab')
        use_id = 'id' in reader.fieldnames
    with open(path, 'w') as csvfile:
        fields_to_write = cls.columns_used()
        if use_id:
            fields_to_write += ['id']
        writer = csv.DictWriter(csvfile, dialect='excel-tab', fieldnames=fields_to_write)
        writer.writeheader()
        for row in rows.items():
            writer.writerow({k: v for k, v in row[1].items() if k in fields_to_write})


def process_export(old_export_path, new_export_path):
    client = ndb.Client()
    for table, cls in get_tables():
        logging.info('Processing ' + table)
        table_suffix = '/WCA_export_' + table + '.tsv'
        with client.context():
            old_rows = read_table(old_export_path + table_suffix, cls, False)
            logging.info('Old: %d' % len(old_rows))
            new_rows = read_table(new_export_path + table_suffix, cls, True)
            logging.info('New: %d' % len(new_rows))
            write_table(new_export_path + table_suffix, new_rows, cls)

            objects_to_put = []
            keys_to_delete = []

            # Only rows that are new or changed get written; rows missing from
            # the new export get deleted.
            modifier = get_modifier(table)
            for key in new_rows:
                row = new_rows[key]
                if key in old_rows and old_rows[key] == row:
                    continue
                else:
                    obj = cls(id=key)
                    obj.parse_from_dict(row)
                    if modifier:
                        modifier(obj)
                    objects_to_put += [obj]
            for key, row in old_rows.items():
                if key in new_rows:
                    continue
                else:
                    keys_to_delete += [ndb.Key(cls, key)]

        logging.info('Putting %d objects' % len(objects_to_put))
        while objects_to_put:
            batch_size = 5000
            logging.info('%d left' % len(objects_to_put))
            subslice = objects_to_put[:batch_size]
            objects_to_put = objects_to_put[batch_size:]
            # A fresh context per batch keeps memory use bounded.
            with client.context():
                ndb.put_multi(subslice)

        logging.info('Deleting %d objects' % len(keys_to_delete))
        client = ndb.Client()
        with client.context():
            ndb.delete_multi(keys_to_delete)


def main(argv):
    old_export_path = FLAGS.export_base + FLAGS.old_export_id
    new_export_path = FLAGS.export_base + FLAGS.new_export_id

    logging.info(old_export_path)
    logging.info(new_export_path)

    # A new client context is created for each write here, to avoid a memory leak.
    process_export(old_export_path, new_export_path)

    client = ndb.Client()
    with client.context():
        set_latest_export(FLAGS.new_export_id)
        update_champions()


if __name__ == '__main__':
    app.run(main)
46 changes: 46 additions & 0 deletions back/backend/load_db/load_db.sh
@@ -0,0 +1,46 @@
set -e

export PYTHONPATH=$(pwd)

if [ "$SCC_ENV" != "COMPUTE_ENGINE" ]
then
    echo "Emulating datastore."
    #$(gcloud beta emulators datastore env-init)
fi

echo "Deleting old exports"
python3 backend/load_db/delete_old_exports.py \
    --export_base=exports/
echo "Done deleting old exports"

SAVED_EXPORT=$(python3 backend/load_db/get_latest_export.py)
LATEST_EXPORT=$(curl https://www.worldcubeassociation.org/export/results \
    | grep TSV:.*WCA_export \
    | sed -s 's/.*\(WCA_export[0-9A-Za-z_]*\).tsv.zip.*/\1/')

if [ "$SAVED_EXPORT" == "$LATEST_EXPORT" ]
then
    echo "Already have latest export $LATEST_EXPORT; skipping download."
fi

if [ "$SAVED_EXPORT" != "$LATEST_EXPORT" ]
then
    echo "Downloading $LATEST_EXPORT"
    URL_TO_FETCH="https://www.worldcubeassociation.org/export/results/$LATEST_EXPORT.tsv.zip"
    EXPORT_DIR="exports/$LATEST_EXPORT"
    mkdir -p exports/
    rm -rf ./$EXPORT_DIR
    mkdir $EXPORT_DIR
    ZIP_FILE="$EXPORT_DIR/$LATEST_EXPORT.tsv.zip"

    curl $URL_TO_FETCH > $ZIP_FILE
    unzip $ZIP_FILE -d $EXPORT_DIR
    rm $ZIP_FILE

    python3 backend/load_db/load_db.py \
        --old_export_id="$SAVED_EXPORT" \
        --new_export_id="$LATEST_EXPORT" \
        --export_base=exports/
fi

/usr/sbin/shutdown -h now
10 changes: 10 additions & 0 deletions back/backend/load_db/startup.sh
@@ -0,0 +1,10 @@
# Script run on VM startup.
# This is a wrapper for load_db.sh; most of the logic should be in there.
# This is not recommended for running locally, since it sets the environment
# to COMPUTE_ENGINE.

source env/bin/activate
source /root/.bashrc
pip3 install -r requirements.txt

SCC_ENV=COMPUTE_ENGINE ./backend/load_db/load_db.sh