split out metadata locations and allow dev and prod deploys
ohnorobo committed Jan 10, 2023
1 parent f8c8b84 commit 41f91f4
Showing 12 changed files with 169 additions and 54 deletions.
5 changes: 4 additions & 1 deletion Dockerfile
@@ -18,11 +18,14 @@ FROM python:3.8-buster
# Allow statements and log messages to immediately appear in the Knative logs
ENV PYTHONUNBUFFERED True

# Define env variable to be passed in by deploy.sh
ENV PIPELINE_ENV change_me

# Copy local code to the container image.
ENV APP_HOME /app
WORKDIR $APP_HOME
COPY . ./

RUN pip install -r requirements.txt

ENTRYPOINT python3 schedule_pipeline.py
ENTRYPOINT python3 schedule_pipeline.py --env=${PIPELINE_ENV}
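The new `ENTRYPOINT` forwards `PIPELINE_ENV` into the scheduler as `--env`. `schedule_pipeline.py` itself is not part of this diff, so the sketch below is only an assumption of how that flag might be consumed, mirroring the argparse blocks this commit adds under `mirror/routeviews/`:

```python
# Illustrative sketch only: schedule_pipeline.py is not shown in this commit,
# but its new --env flag could be parsed like the argparse blocks added in
# mirror/routeviews/bulk_download.py and sync_routeviews.py.
import argparse

parser = argparse.ArgumentParser(description='Schedule the daily pipeline.')
parser.add_argument(
    '--env',
    type=str,
    default='dev',
    choices=['dev', 'prod'],
    help='Whether to run against the prod or dev gcloud project')
args = parser.parse_args()  # --env is supplied by the Docker ENTRYPOINT
```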
46 changes: 34 additions & 12 deletions deploy.sh
@@ -16,28 +16,47 @@

# to run
#
# ./deploy prod
# ./deploy.sh deploy prod
# to deploy to production
#
# ./deploy delete
# ./deploy.sh delete prod
# to delete the prod instance (usually because you have changed
# some instance boot value and want to recreate it from scratch)

# exit on error
set -e

action=$1
env=$2

# CHANGE THESE VALUES TO MATCH YOUR PROJECT
# String id of a google cloud project
project="firehook-censoredplanet"
dev_project="firehook-censoredplanet"
# Int id of a cloud service account with the correct access permissions
service_account_id="654632410498"
dev_service_account_id="654632410498"

# String id of a google cloud project
prod_project="censoredplanet-analysisv1"
# Int id of a cloud service account with the correct access permissions
prod_service_account_id="669508427087"

# GCP zone to deploy to
zone="us-east1-b"

if [[ "${env}" == "dev" ]]; then
project="${dev_project}"
service_account_id="${dev_service_account_id}"
env_file="dev.env"
elif [[ "${env}" == "prod" ]]; then
project="${prod_project}"
service_account_id="${prod_service_account_id}"
env_file="prod.env"
else
echo "Unknown env ${env}"
exit 1
fi

if [[ "${action}" == "backfill" ]]; then
docker build --tag ${project} .
docker build . --tag ${project}

# Get service credentials if they are missing
if [[ ! -f ~/.config/gcloud/${service_account_id}_compute_credentials.json ]]; then
@@ -47,15 +66,16 @@ if [[ "${action}" == "backfill" ]]; then
fi

# --entrypoint 'python3 -m pipeline.run_beam_tables --env=dev --full'
docker run -it \
docker run -it --env-file "${env_file}" \
-v $HOME/.config/gcloud:$HOME/.config/gcloud \
-e GOOGLE_APPLICATION_CREDENTIALS=$HOME/.config/gcloud/${service_account_id}_compute_credentials.json \
${project}
${project}

elif [[ "${action}" == "dev" ]]; then
elif [[ "${action}" == "deploy" ]]; then
# For builders outside the VPC security perimeter the build will succeed
# but throw a logging error, so we ignore errors here
gcloud builds submit . --tag gcr.io/${project}/pipeline --project ${project} || true
gcloud builds submit . \
--tag gcr.io/${project}/pipeline --project ${project} || true

# Instead check that the latest build succeeded
if gcloud builds list --limit=1 --project ${project} | grep SUCCESS; then
@@ -70,14 +90,16 @@ elif [[ "${action}" == "dev" ]]; then
if gcloud compute instances list --project ${project} | grep -q ${project}; then
# update
gcloud compute instances update-container ${project} --zone ${zone} \
--container-image gcr.io/${project}/pipeline:latest --project ${project}
--container-image gcr.io/${project}/pipeline:latest --project ${project} \
--container-env-file="${env_file}"
else
# otherwise create a new instance
gcloud compute instances create-with-container ${project} \
--container-image gcr.io/${project}/pipeline:latest \
--machine-type e2-highmem-4 --zone ${zone} --boot-disk-size 50GB \
--service-account ${service_account_id}[email protected] \
--scopes=bigquery,cloud-platform,default --project ${project}
--scopes=bigquery,cloud-platform,default --project ${project} \
--container-env-file="${env_file}"
fi

elif [[ "${action}" == "delete" ]]; then
1 change: 1 addition & 0 deletions dev.env
@@ -0,0 +1 @@
PIPELINE_ENV=dev
27 changes: 20 additions & 7 deletions docs/production.md
@@ -11,25 +11,33 @@ There are two main top-level pieces of the pipeline
This sets up a daily data transfer job to copy scan files from the Censored
Planet cloud bucket to an internal bucket.

`python -m schedule_pipeline`
`python -m schedule_pipeline --env=dev`

This does some additional daily data processing and schedules a daily
incremental Apache Beam pipeline over the data. It expects to be run via a
Docker container on a GCE machine.

`./deploy.sh dev`
`./deploy.sh deploy prod`

Will deploy the main pipeline loop to a GCE machine. If the machine does not
exist it will be created; if it does exist it will be updated.

To deploy to the dev project, run

`./deploy.sh deploy dev`

## Turning off the Automated Pipeline

To stop running the automated pipeline, run

`./deploy.sh delete`
`./deploy.sh delete prod`

This will delete the GCE machine.

To stop the pipeline in the dev project, run

`./deploy.sh delete dev`

## Running Manually

Individual pieces of the pipeline can be run manually.
@@ -89,15 +97,20 @@ fail because of a schema mismatch.
Here are the steps to run a backfill:

* Checkout master and make sure you're synced to the latest changes.
* `./deploy.sh delete` turn off the nightly pipeline so it doesn't conflict
with the backfill
* `./deploy.sh delete dev` and `./deploy.sh delete prod` to turn off the nightly
pipeline so it doesn't conflict with the backfill.
* `python -m pipeline.run_beam_tables --env=dev --scan_type=all --full` to
run manual backfill jobs; this can take several hours.
* Make sure a job is running for each scan type in Dataflow. If some scan
types didn't start then re-run them by hand.
* Check if the backfill worked the next day
* if so run `./deploy.sh dev` at head to turn the pipeline back on with
the new code
* If so copy the new base tables from dev to prod by running (a Python
equivalent is sketched after this list):
* ```bq mk --transfer_config --project_id=669508427087 \
--data_source=cross_region_copy --target_dataset=base \
--display_name="base tables" \
--params='{"source_dataset_id":"base","source_project_id":"firehook-censoredplanet","overwrite_destination_table":"true"}'```
* Redeploy `./deploy.sh deploy dev` and `./deploy.sh deploy prod` at head
to turn the pipeline back on with the new code.
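
The `bq mk --transfer_config` command in the backfill steps above can also be issued from Python. This is only a sketch, assuming the `google-cloud-bigquery-datatransfer` client library is installed and that the prod project `censoredplanet-analysisv1` is the copy destination; the `bq` command above remains the documented recipe.

```python
# Sketch: equivalent of the `bq mk --transfer_config` command above, assuming
# google-cloud-bigquery-datatransfer is installed and the caller has access
# to the prod project (censoredplanet-analysisv1).
from google.cloud import bigquery_datatransfer

client = bigquery_datatransfer.DataTransferServiceClient()

transfer_config = bigquery_datatransfer.TransferConfig(
    destination_dataset_id='base',
    display_name='base tables',
    data_source_id='cross_region_copy',
    params={
        'source_project_id': 'firehook-censoredplanet',
        'source_dataset_id': 'base',
        'overwrite_destination_table': 'true',
    },
)

# The destination project comes from the transfer config's parent resource.
created = client.create_transfer_config(
    parent=client.common_project_path('censoredplanet-analysisv1'),
    transfer_config=transfer_config,
)
print(f'Created transfer config: {created.name}')
```
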
## Access
19 changes: 12 additions & 7 deletions firehook_resources.py
@@ -16,17 +16,22 @@
PROD_PROJECT_NAME = 'censoredplanet-analysisv1'
DEV_PROJECT_NAME = 'firehook-censoredplanet'

# Buckets that store scanfiles
# Bucket that stores scanfiles
SCAN_BUCKET = 'censoredplanetscanspublic'
INPUT_BUCKET = f'gs://{SCAN_BUCKET}/'

# Buckets that store METADATA information
# TODO change this bucket name to something metadata related.
METADATA_BUCKET = 'censoredplanet_geolocation'
ROUTEVIEW_PATH = 'caida/routeviews/'
CAIDA_FILE_LOCATION = f'gs://{METADATA_BUCKET}/caida/'
MAXMIND_FILE_LOCATION = f'gs://{METADATA_BUCKET}/maxmind/'
DBIP_FILE_LOCATION = f'gs://{METADATA_BUCKET}/dbip/'

# Buckets that store METADATA information
DEV_METADATA_BUCKET = 'censoredplanet_geolocation'
DEV_CAIDA_FILE_LOCATION = f'gs://{DEV_METADATA_BUCKET}/caida/'
DEV_MAXMIND_FILE_LOCATION = f'gs://{DEV_METADATA_BUCKET}/maxmind/'
DEV_DBIP_FILE_LOCATION = f'gs://{DEV_METADATA_BUCKET}/dbip/'

PROD_METADATA_BUCKET = 'censoredplanet_ip_metadata'
PROD_CAIDA_FILE_LOCATION = f'gs://{PROD_METADATA_BUCKET}/caida/'
PROD_MAXMIND_FILE_LOCATION = f'gs://{PROD_METADATA_BUCKET}/maxmind/'
PROD_DBIP_FILE_LOCATION = f'gs://{PROD_METADATA_BUCKET}/dbip/'

# Output GCS Buckets
DEV_OUTPUT_BUCKET = 'firehook-test'
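The metadata locations are now split into `DEV_*` and `PROD_*` constants, and callers pick one set based on an env string. A hypothetical helper (not part of this commit) makes the intended mapping explicit; the callers in this commit branch on the env string inline instead:

```python
# Hypothetical helper (not in this commit) showing how the new DEV_*/PROD_*
# constants are selected per environment.
from typing import Tuple

import firehook_resources


def metadata_locations(env: str) -> Tuple[str, str, str]:
  """Return the (CAIDA, Maxmind, DBIP) GCS locations for 'dev' or 'prod'."""
  if env == 'dev':
    return (firehook_resources.DEV_CAIDA_FILE_LOCATION,
            firehook_resources.DEV_MAXMIND_FILE_LOCATION,
            firehook_resources.DEV_DBIP_FILE_LOCATION)
  if env == 'prod':
    return (firehook_resources.PROD_CAIDA_FILE_LOCATION,
            firehook_resources.PROD_MAXMIND_FILE_LOCATION,
            firehook_resources.PROD_DBIP_FILE_LOCATION)
  raise ValueError(f'Unknown env: {env}')
```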
25 changes: 22 additions & 3 deletions mirror/routeviews/bulk_download.py
@@ -13,6 +13,7 @@
# limitations under the License.
"""Bulk importer for CAIDA routeview files."""

import argparse
import datetime
import urllib.request

@@ -78,12 +79,30 @@ def download_days_routeview(bucket: storage.bucket.Bucket,
raise ex


def download_manual_routeviews_firehook() -> None:
def download_manual_routeviews_firehook(env: str) -> None:
"""Download routeviews for a given project
Args:
env: one of 'dev' or 'prod', which gcloud project env to use.
"""
client = storage.Client()
bucket = client.get_bucket(firehook_resources.METADATA_BUCKET)
if env == 'dev':
bucket = client.get_bucket(firehook_resources.DEV_METADATA_BUCKET)
if env == 'prod':
bucket = client.get_bucket(firehook_resources.PROD_METADATA_BUCKET)

download_manual_routeviews(bucket)


if __name__ == "__main__":
download_manual_routeviews_firehook()
parser = argparse.ArgumentParser(
description='Manually download routeview files for project.')
parser.add_argument(
'--env',
type=str,
default='dev',
choices=['dev', 'prod'],
help='Whether to write to prod or dev gcloud project')
args = parser.parse_args()

download_manual_routeviews_firehook(args.env)
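A short usage sketch of the new env-aware entrypoint, assuming gcloud application-default credentials that can read and write the chosen project's metadata bucket:

```python
# Usage sketch only; assumes gcloud credentials with access to the chosen
# project's metadata bucket.
from mirror.routeviews.bulk_download import download_manual_routeviews_firehook

# Mirrors routeview files into the dev bucket (censoredplanet_geolocation);
# pass 'prod' to use censoredplanet_ip_metadata instead.
download_manual_routeviews_firehook('dev')
```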
26 changes: 22 additions & 4 deletions mirror/routeviews/sync_routeviews.py
@@ -13,6 +13,7 @@
# limitations under the License.
"""Mirror the latest CAIDA routeview files into a cloud bucket."""

import argparse
import os
import pathlib
from pprint import pprint
@@ -110,14 +111,31 @@ def sync(self) -> None:
pprint(("transferred file: ", new_file))


def get_firehook_routeview_mirror() -> RouteviewMirror:
"""Factory function to get a RouteviewUpdater with our project values."""
def get_firehook_routeview_mirror(env: str) -> RouteviewMirror:
"""Factory function to get a RouteviewUpdater with our project values.
Args:
env: one of 'dev' or 'prod' which gcloud project env to use.
"""
client = storage.Client()
bucket = client.get_bucket(firehook_resources.METADATA_BUCKET)
if env == 'dev':
bucket = client.get_bucket(firehook_resources.DEV_METADATA_BUCKET)
if env == 'prod':
bucket = client.get_bucket(firehook_resources.PROD_METADATA_BUCKET)

return RouteviewMirror(bucket, firehook_resources.ROUTEVIEW_PATH)


if __name__ == "__main__":
# Called manually when running a backfill.
get_firehook_routeview_mirror().sync()
parser = argparse.ArgumentParser(
description='Download latest routeview files.')
parser.add_argument(
'--env',
type=str,
default='dev',
choices=['dev', 'prod'],
help='Whether to write to prod or dev gcloud project')
args = parser.parse_args()

get_firehook_routeview_mirror(args.env).sync()
6 changes: 3 additions & 3 deletions pipeline/manual_e2e_test.py
@@ -669,7 +669,7 @@ def test_invalid_pipeline(self) -> None:
def test_ipmetadata_init(self) -> None:
"""Test getting getting routeview metadata from prod."""
caida_ip_metadata_db = caida_ip_metadata.get_firehook_caida_ip_metadata_db(
datetime.date(2018, 7, 27))
'dev', datetime.date(2018, 7, 27))

metadata = caida_ip_metadata_db.lookup('1.1.1.1')
self.assertEqual(metadata, ('1.1.1.0/24', 13335, 'CLOUDFLARENET',
@@ -690,7 +690,7 @@ def test_ipmetadata_init(self) -> None:
def test_maxmind_init(self) -> None:
"""Test getting maxmind metadata from prod."""
maxmind_db = maxmind.MaxmindIpMetadata(
firehook_resources.MAXMIND_FILE_LOCATION)
firehook_resources.DEV_MAXMIND_FILE_LOCATION)

metadata = maxmind_db.lookup('1.1.1.1')
self.assertEqual(metadata, ('1.1.1.0/24', 13335, 'CLOUDFLARENET', 'AU'))
@@ -705,7 +705,7 @@ def test_maxmind_init(self) -> None:

def test_dbip_init(self) -> None:
"""Test DBIP database access from prod."""
dbip_data = dbip.DbipMetadata(firehook_resources.DBIP_FILE_LOCATION)
dbip_data = dbip.DbipMetadata(firehook_resources.DEV_DBIP_FILE_LOCATION)

(org, asn) = dbip_data.lookup('1.211.95.160')
self.assertEqual(org, "Boranet")
10 changes: 8 additions & 2 deletions pipeline/metadata/caida_ip_metadata.py
@@ -333,12 +333,14 @@ def lookup(self, ip: str) -> CaidaReturnValues:


def get_firehook_caida_ip_metadata_db(
env: str,
date: datetime.date,
allow_previous_day: bool = False,
) -> CaidaIpMetadata:
"""Factory to return an CaidaIpMetadata object which reads in firehook files.
Args:
env: one of 'dev' or 'prod' which gcloud project env to use.
date: a date to initialize the asn database to
allow_previous_day: If the given date's routeview file doesn't exist, allow
the one from the previous day instead. This is useful when processing very
@@ -349,5 +351,9 @@ def get_firehook_caida_ip_metadata_db(
"""
# import here to avoid beam pickling issues
import firehook_resources # pylint: disable=import-outside-toplevel
return CaidaIpMetadata(date, firehook_resources.CAIDA_FILE_LOCATION,
allow_previous_day)
if env == 'dev':
file_location = firehook_resources.DEV_CAIDA_FILE_LOCATION
if env == 'prod':
file_location = firehook_resources.PROD_CAIDA_FILE_LOCATION

return CaidaIpMetadata(date, file_location, allow_previous_day)
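A short usage sketch of the updated factory; the import path mirrors the call in `pipeline/manual_e2e_test.py`, and the date and IP are illustrative:

```python
# Usage sketch; values are illustrative. Mirrors the call pattern in
# pipeline/manual_e2e_test.py.
import datetime

from pipeline.metadata import caida_ip_metadata

caida_db = caida_ip_metadata.get_firehook_caida_ip_metadata_db(
    'prod', datetime.date(2023, 1, 10), allow_previous_day=True)
# Routeview-derived metadata for the IP,
# e.g. ('1.1.1.0/24', 13335, 'CLOUDFLARENET', ...).
print(caida_db.lookup('1.1.1.1'))
```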
15 changes: 10 additions & 5 deletions pipeline/run_beam_tables.py
@@ -101,17 +101,22 @@ def get_beam_pipeline_runner(
# importing here to avoid beam pickling issues
import firehook_resources # pylint: disable=import-outside-toplevel

metadata_chooser_factory = IpMetadataChooserFactory(
firehook_resources.CAIDA_FILE_LOCATION,
firehook_resources.MAXMIND_FILE_LOCATION,
firehook_resources.DBIP_FILE_LOCATION)

if env in ('dev', 'user'):
metadata_chooser_factory = IpMetadataChooserFactory(
firehook_resources.DEV_CAIDA_FILE_LOCATION,
firehook_resources.DEV_MAXMIND_FILE_LOCATION,
firehook_resources.DEV_DBIP_FILE_LOCATION)

project_name = firehook_resources.DEV_PROJECT_NAME
staging_location = firehook_resources.DEV_BEAM_STAGING_LOCATION
temp_location = firehook_resources.DEV_BEAM_TEMP_LOCATION
output_bucket = firehook_resources.DEV_OUTPUT_BUCKET
if env == 'prod':
metadata_chooser_factory = IpMetadataChooserFactory(
firehook_resources.PROD_CAIDA_FILE_LOCATION,
firehook_resources.PROD_MAXMIND_FILE_LOCATION,
firehook_resources.PROD_DBIP_FILE_LOCATION)

project_name = firehook_resources.PROD_PROJECT_NAME
staging_location = firehook_resources.PROD_BEAM_STAGING_LOCATION
temp_location = firehook_resources.PROD_BEAM_TEMP_LOCATION
1 change: 1 addition & 0 deletions prod.env
@@ -0,0 +1 @@
PIPELINE_ENV=prod