Skip to content

Commit

Permalink
Merge pull request #58 from Sage-Bionetworks/ETL-91
Browse files Browse the repository at this point in the history
Etl-91 Job / artifact versioning
  • Loading branch information
tthyer authored Mar 21, 2022
2 parents 3222e2c + 51ae4f0 commit c8a57c9
Show file tree
Hide file tree
Showing 11 changed files with 72 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cleanup.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ jobs:
run: pipenv run sceptre --debug --var "namespace=${{ github.event.ref }}" delete develop/namespaced --yes

- name: Remove artifacts
run: pipenv run python src/scripts/manage_artifacts/artifacts.py --remove --namespace "${{ github.event.ref }}"
run: pipenv run python src/scripts/manage_artifacts/artifacts.py --remove --ref "${{ github.event.ref }}"
6 changes: 3 additions & 3 deletions .github/workflows/upload-and-deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ jobs:
with:
version: 1.37.0

- name: Set namespace for branch
- name: Set namespace for non-default branch or for tag
if: github.ref_name != 'main'
run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV

- name: Copy files to templates bucket
run: python src/scripts/manage_artifacts/artifacts.py --upload --namespace ${{ env.NAMESPACE }}
run: python src/scripts/manage_artifacts/artifacts.py --upload --ref $GITHUB_REF_NAME

- name: Login to Amazon ECR
id: login-ecr
Expand All @@ -62,7 +62,7 @@ jobs:
--no-progressbar
aws s3 cp .aws-sam/build/template.yaml \
s3://$CFN_BUCKET/$REPO_NAME/${{ env.NAMESPACE }}/templates/lambda/sns_to_glue/
s3://$CFN_BUCKET/$REPO_NAME/$GITHUB_REF_NAME/templates/lambda/sns_to_glue/
sceptre-deploy-branch:
name: Deploy branch using sceptre
Expand Down
2 changes: 1 addition & 1 deletion config/develop/namespaced/example-app-1-study-1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ parameters:
AppName: example-app-1
StudyName: study-1
TemplateBucketName: {{ stack_group_config.artifact_bucket_name }}
ArtifactRef: ETL-91
SourceBucketName: !stack_output_external bridge-downstream-dev-source-bucket::BucketName
JsonBucketName: !stack_output_external bridge-downstream-dev-intermediate-bucket::BucketName
ParquetBucketName: !stack_output_external bridge-downstream-dev-parquet-bucket::BucketName
RoleArn: !stack_output_external glue-job-role::RoleArn
ClassifierName: !stack_output_external array-of-records-classifier::ClassifierName
SynapseAuthSsmParameterName: '{{ stack_group_config.synapseAuthSsmParameterName }}'
UniqueId: !rcmd git rev-parse HEAD
S3ToJsonS3JobName: !stack_output_external '{{ ns }}-glue-job-S3ToJsonS3::JobName'

stack_tags:
Expand Down
4 changes: 2 additions & 2 deletions config/develop/namespaced/glue-job-S3ToJsonS3.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#{% set ns = stack_group_config.namespace %}
#{% set glue_script_tag = 'ETL-91' %}
template_path: glue-spark-job.j2
dependencies:
- develop/s3-intermediate-bucket.yaml
Expand All @@ -9,8 +10,7 @@ parameters:
BookmarkOption: job-bookmark-disable
JobDescription: Convert data to JSONS3 data
MaxConcurrentRuns: '150'
S3ScriptLocation: s3://{{ stack_group_config.artifact_bucket_name }}/BridgeDownstream/{{ stack_group_config.namespace }}/glue/jobs/s3_to_json_s3.py
UniqueId: !rcmd git rev-parse HEAD
S3ScriptLocation: s3://{{ stack_group_config.artifact_bucket_name }}/BridgeDownstream/{{ glue_script_tag }}/glue/jobs/s3_to_json_s3.py
SynapseAuthSsmParameterName: {{ stack_group_config.synapseAuthSsmParameterName }}
AdditionalPythonModules: 'synapseclient'
DatasetMapping: s3://{{ stack_group_config.artifact_bucket_name }}/BridgeDownstream/{{ stack_group_config.namespace }}/glue/resources/dataset_mapping.json
Expand Down
11 changes: 9 additions & 2 deletions src/lambda/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,16 @@ Test the function by invoking it directly with a test event. Test events are
included in the `events` folder.

To test the lambda locally, run the following command from the lambda directory.
Ensure that `event.json` contains a valid Synapse id, and that `test-env-vars.json`
contains all the environment vars that are expected in `sns_to_glue/app.py`.
Use `single-record.json` to test just one assessment or `records.json` to test
multiple. The file `test-env-vars.json` contains environment variables that
are expected by the lambda script.

To invoke the lambda with one event:
```bash
lambda$ sam local invoke -e events/single-record.json --env-vars test-env-vars.json
```

To invoke the lambda with multiple events:
```bash
lambda$ sam local invoke -e events/records.json --env-vars test-env-vars.json
```
2 changes: 1 addition & 1 deletion src/lambda/events/records.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/lambda/events/single-record.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"Records": [{"messageId": "20530d39-538a-5330-b376-57429074a158", "receiptHandle": "AQEBwJnKyrHigUMZj6rYigCgxlaS3SLy0a...", "attributes": {"ApproximateReceiveCount": "1", "SentTimestamp": "1545082649183", "SenderId": "AIDAIENQZJOLO23YVJ4VO", "ApproximateFirstReceiveTimestamp": "1545082649185"}, "body": "{\"Type\": \"Notification\", \"MessageId\": \"45127357-4996-58e1-af22-2922586ba8f2\", \"TopicArn\": \"arn:aws:sns:us-east-1:563295687221:phil_s3_bucket_update\", \"Message\": \"{\\\"appId\\\": \\\"example-app-1\\\", \\\"recordId\\\": \\\"-4I2GOqDSdjaXsbuw8oYXBKK\\\", \\\"record\\\": {\\\"parentProjectId\\\": \\\"syn26721259\\\", \\\"rawFolderId\\\": \\\"syn27199726\\\", \\\"fileEntityId\\\": \\\"syn27199739\\\", \\\"s3Bucket\\\": \\\"bridge-downstream-dev-source\\\", \\\"s3Key\\\": \\\"test-data/3FLD5VVuUivuO9OcI-6700G8-raw.zip\\\"}, \\\"studyRecords\\\": {\\\"study-1\\\": {\\\"parentProjectId\\\": \\\"syn26721259\\\", \\\"rawFolderId\\\": \\\"syn27199726\\\", \\\"fileEntityId\\\": \\\"syn27199739\\\", \\\"s3Bucket\\\": \\\"bridge-downstream-dev-source\\\", \\\"s3Key\\\": \\\"test-data/3FLD5VVuUivuO9OcI-6700G8-raw.zip\\\"}}}\", \"Timestamp\": \"2022-02-02T23:11:57.105Z\", \"SignatureVersion\": \"1\", \"Signature\": \"uJ4zpc5M/dImqUxw2uABcl8V2WeBkXRZolX4wwtVxyqp/OG5IqR0upEH35Pp7WHx2/tpAzMnSImjOFsqfveFce4cDum1CtQtlj7mkZyxq+sV1VKxgJot2N8DzMxTBxVmNELc9fbOGgukSwv76dQJ0tiu0GUITmL/8tHcRacimPkElPL6ZC9jFIiR0MM6f2wZkwbRMbvfo1sOdjYcF9VzD4J0fe6qbHjKFGoTGYQ98hJCgMU8mknTHWoGu2InLPAOZZ+hNl+gt/lCS7oihP1rBMoGg+yi8wF/F2bcoKierEuF5DmAkPkxOHi7j8ikfBmJ2o/zDFknx6XmRL4a9rMUow==\", \"SigningCertURL\": \"https://sns.us-east-1.amazonaws.com/SimpleNotificationService-7ff5318490ec183fbaddaa2a969abfda.pem\", \"UnsubscribeURL\": \"https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:563295687221:phil_s3_bucket_update:89b889dd-9e68-41a5-b865-c194f09d024c\"}", "messageAttributes": {}, "md5OfBody": "e4e68fb7bd0e697a0ae8f1bb342846b3", "eventSource": 
"aws:sqs", "eventSourceARN": "arn:aws:sqs:us-east-1:123456789012:my-queue", "awsRegion": "us-east-1"}]}
{"Records": [{"messageId": "20530d39-538a-5330-b376-57429074a158", "receiptHandle": "AQEBwJnKyrHigUMZj6rYigCgxlaS3SLy0a...", "attributes": {"ApproximateReceiveCount": "1", "SentTimestamp": "1545082649183", "SenderId": "AIDAIENQZJOLO23YVJ4VO", "ApproximateFirstReceiveTimestamp": "1545082649185"}, "body": "{\"Type\": \"Notification\", \"MessageId\": \"45127357-4996-58e1-af22-2922586ba8f2\", \"TopicArn\": \"arn:aws:sns:us-east-1:563295687221:phil_s3_bucket_update\", \"Message\": \"{\\\"appId\\\": \\\"example-app-1\\\", \\\"recordId\\\": \\\"-4I2GOqDSdjaXsbuw8oYXBKK\\\", \\\"record\\\": {\\\"parentProjectId\\\": \\\"syn26721259\\\", \\\"rawFolderId\\\": \\\"syn27558289\\\", \\\"fileEntityId\\\": \\\"syn27558320\\\", \\\"s3Bucket\\\": \\\"bridge-downstream-dev-source\\\", \\\"s3Key\\\": \\\"test-data/3FLD5VVuUivuO9OcI-6700G8-raw.zip\\\"}, \\\"studyRecords\\\": {\\\"study-1\\\": {\\\"parentProjectId\\\": \\\"syn26721259\\\", \\\"rawFolderId\\\": \\\"syn27558289\\\", \\\"fileEntityId\\\": \\\"syn27558320\\\", \\\"s3Bucket\\\": \\\"bridge-downstream-dev-source\\\", \\\"s3Key\\\": \\\"test-data/3FLD5VVuUivuO9OcI-6700G8-raw.zip\\\"}}}\", \"Timestamp\": \"2022-02-02T23:11:57.105Z\", \"SignatureVersion\": \"1\", \"Signature\": \"uJ4zpc5M/dImqUxw2uABcl8V2WeBkXRZolX4wwtVxyqp/OG5IqR0upEH35Pp7WHx2/tpAzMnSImjOFsqfveFce4cDum1CtQtlj7mkZyxq+sV1VKxgJot2N8DzMxTBxVmNELc9fbOGgukSwv76dQJ0tiu0GUITmL/8tHcRacimPkElPL6ZC9jFIiR0MM6f2wZkwbRMbvfo1sOdjYcF9VzD4J0fe6qbHjKFGoTGYQ98hJCgMU8mknTHWoGu2InLPAOZZ+hNl+gt/lCS7oihP1rBMoGg+yi8wF/F2bcoKierEuF5DmAkPkxOHi7j8ikfBmJ2o/zDFknx6XmRL4a9rMUow==\", \"SigningCertURL\": \"https://sns.us-east-1.amazonaws.com/SimpleNotificationService-7ff5318490ec183fbaddaa2a969abfda.pem\", \"UnsubscribeURL\": \"https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:563295687221:phil_s3_bucket_update:89b889dd-9e68-41a5-b865-c194f09d024c\"}", "messageAttributes": {}, "md5OfBody": "e4e68fb7bd0e697a0ae8f1bb342846b3", "eventSource": 
"aws:sqs", "eventSourceARN": "arn:aws:sqs:us-east-1:123456789012:my-queue", "awsRegion": "us-east-1"}]}
32 changes: 32 additions & 0 deletions src/scripts/bookmarks/get-bookmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env python3
"""Print the Glue job bookmark for every Glue job whose name starts with
a given namespace prefix.

Intended as an operator convenience script: it lists jobs via the Glue
API, filters by name prefix, and prints each job's bookmark state.
"""

import argparse


def read_args(argv=None):
    '''Parse command-line arguments.

    :param argv: optional list of argument strings; defaults to
        ``sys.argv[1:]`` when None (normal CLI use). Accepting an
        explicit list keeps the function testable.
    :return: argparse.Namespace with a ``namespace`` attribute.
    '''
    descriptions = '''
    Get job bookmarks.
    '''
    parser = argparse.ArgumentParser(
        description=descriptions)
    parser.add_argument(
        '--namespace',
        default='bridge-downstream',
        help='Only inspect Glue jobs whose name starts with this prefix')
    # Bug fix: the original built the parser but never parsed, returning
    # None and crashing at the args.namespace access below.
    return parser.parse_args(argv)


def main(namespace):
    '''Fetch and print the bookmark for each Glue job under ``namespace``.

    :param namespace: prefix used to select which Glue jobs to inspect.
    '''
    # Imported lazily so that --help and argument parsing work even in
    # environments where boto3 is not installed.
    import boto3

    client = boto3.client('glue')
    # NOTE(review): get_jobs is paginated; jobs beyond the first page are
    # not listed here — confirm whether pagination matters for this account.
    response = client.get_jobs()
    all_jobs = response['Jobs']
    job_names = [job['Name'] for job in all_jobs if job['Name'].startswith(namespace)]

    for job_name in job_names:
        print('')
        response = client.get_job_bookmark(JobName=job_name)
        print(response)


if __name__ == "__main__":
    args = read_args()
    main(args.namespace)
25 changes: 12 additions & 13 deletions src/scripts/manage_artifacts/artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ def read_args():
parser = argparse.ArgumentParser(
description='')
parser.add_argument(
'--namespace',
default='bridge-downstream')
'--ref')
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--upload', action='store_true')
group.add_argument('--remove', action='store_true')
Expand All @@ -27,40 +26,40 @@ def execute_command(cmd):
subprocess.run(cmd)


def upload(namespace):
def upload(ref):
'''Copies Glue scripts and CFN templates to the artifacts bucket'''
scripts_local_path = 'src/glue/'
scripts_s3_path = f's3://{cfn_bucket}/{repo_name}/{namespace}/glue/'
scripts_s3_path = f's3://{cfn_bucket}/{repo_name}/{ref}/glue/'
cmd = ['aws', 's3', 'sync', scripts_local_path, scripts_s3_path]
execute_command(cmd)

templates_local_path = 'templates/'
templates_s3_path = f's3://{cfn_bucket}/{repo_name}/{namespace}/templates/'
templates_s3_path = f's3://{cfn_bucket}/{repo_name}/{ref}/templates/'
cmd = ['aws', 's3', 'sync', templates_local_path, templates_s3_path]
execute_command(cmd)


def delete(namespace):
'''Removes all files recursively for namespace'''
s3_path = f's3://{cfn_bucket}/{repo_name}/{namespace}/'
def delete(ref):
'''Removes all files recursively for ref'''
s3_path = f's3://{cfn_bucket}/{repo_name}/{ref}/'
cmd = ['aws', 's3', 'rm', '--recursive', s3_path]
execute_command(cmd)


def list_namespaces():
'''List all namespaces'''
def list_refs():
'''List all refs'''
s3_path = f's3://{cfn_bucket}/{repo_name}/'
cmd = ['aws','s3','ls', s3_path]
execute_command(cmd)


def main(args):
if args.upload:
upload(args.namespace)
upload(args.ref)
elif args.remove:
delete(args.namespace)
delete(args.ref)
else:
list_namespaces()
list_refs()

if __name__ == "__main__":
args = read_args()
Expand Down
7 changes: 1 addition & 6 deletions templates/glue-spark-job.j2
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,6 @@ Parameters:
Description: The job timeout in minutes (integer).
Default: 120

UniqueId:
Type: String
Description: A unique id for producing unique job names
Default: ''

WorkerType:
Type: String
Description: >-
Expand Down Expand Up @@ -138,7 +133,7 @@ Resources:
MaxConcurrentRuns: !Ref MaxConcurrentRuns
GlueVersion: !Ref GlueVersion
MaxRetries: !Ref MaxRetries
Name: !Sub '${AWS::StackName}-Job-${UniqueId}'
Name: !Sub '${AWS::StackName}-Job'
NumberOfWorkers: !Ref NumberOfWorkers
Role: !Ref JobRole
Timeout: !Ref TimeoutInMinutes
Expand Down
19 changes: 9 additions & 10 deletions templates/study-pipeline-infra.j2
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ Parameters:
Description: Study whose data this pipeline infrastructure processes
AllowedPattern: '[a-z]{1}[a-z0-9.-]*'

ArtifactRef:
Type: String
Description: A branch name or a tag

RoleArn:
Type: String
Description: The ARN of an IAM role that's used to access S3
Expand Down Expand Up @@ -71,11 +75,6 @@ Parameters:
The name of an ssm parameter whose value is Synapse service account
personal access token

UniqueId:
Type: String
Description: A unique id for producing unique job names.
Default: ''

S3ToJsonS3JobName:
Type: String
Description: The name of the S3 To JSON S3 Job
Expand Down Expand Up @@ -112,7 +111,7 @@ Resources:
Properties:
Command:
Name: glueetl
ScriptLocation: !Sub s3://${TemplateBucketName}/${CodeRepositoryName}/${Namespace}/glue/jobs/json_s3_to_parquet.py
ScriptLocation: !Sub s3://${TemplateBucketName}/${CodeRepositoryName}/${ArtifactRef}/glue/jobs/json_s3_to_parquet.py
DefaultArguments:
--TempDir: !Sub s3://${JsonBucketName}/tmp
--enable-continuous-cloudwatch-log: true
Expand All @@ -126,9 +125,9 @@ Resources:
Description: {{ 'Export {} data in parquet format'.format(dataset['dataset_name']) }}
ExecutionProperty:
MaxConcurrentRuns: 1
GlueVersion: '3.0' # Spark 3.1.1, Python 3.7 # parameterize?
MaxRetries: 0 # change this when not in development; parameterize?
Name: !Sub '${Namespace}-${AppName}-${StudyName}-{{dataset['stackname_prefix']}}-Job-${UniqueId}'
GlueVersion: '3.0' # Spark 3.1.1, Python 3.7 # TODO: parameterize
MaxRetries: 0 # change this when not in development; TODO: parameterize
Name: !Sub '${Namespace}-${AppName}-${StudyName}-{{dataset['stackname_prefix']}}-Job'
NumberOfWorkers: 1
Role: !Ref RoleArn
Timeout: 120
Expand Down Expand Up @@ -284,7 +283,7 @@ Resources:
LambdaStack:
Type: AWS::CloudFormation::Stack
Properties:
TemplateURL: !Sub https://${TemplateBucketName}.s3.amazonaws.com/${CodeRepositoryName}/${Namespace}/templates/lambda/sns_to_glue/template.yaml
TemplateURL: !Sub https://${TemplateBucketName}.s3.amazonaws.com/${CodeRepositoryName}/${ArtifactRef}/templates/lambda/sns_to_glue/template.yaml
Parameters:
WorkflowName: !Ref S3ToJsonWorkflow
Namespace: !Ref Namespace

0 comments on commit c8a57c9

Please sign in to comment.