From fee014f10a12e39bc6b5d855b9404b9ae151a00c Mon Sep 17 00:00:00 2001 From: Allan Denot Date: Wed, 8 Apr 2020 16:35:28 +1030 Subject: [PATCH] Blue green cutover and improvements in logging (#7) * Changing how script waits for deployment to end (wip) * Wait for deployment to be created * Update Deploy.sh Due to a known issue in CodeDeploy, the deployment fails if the ECS service is unhealthy/unstable for 5 minutes for the replacement task set during the wait status; this 5-minute window is not configurable as of today. For this reason we wait for 10 minutes before considering a deployment in Ready status successful * Create stop.sh Adding script stop.sh, to be triggered if the customer doesn't approve the new version to be deployed, after accessing and testing the application through the test listener * Update Dockerfile Adding script stop.sh * Cleanup and working on status to fail early if user cancels or deployment fails, instead of waiting for 10 minutes * Refactoring script to wait for deployment when cutover is enabled Co-authored-by: elliot-dnx <62494315+elliot-dnx@users.noreply.github.com> --- Dockerfile | 4 +- docker-compose.yml | 18 +++++ src/app-spec.tpl.json | 21 ++--- src/{cutover.sh => deploy-cutover.sh} | 15 +--- src/deploy-stop.sh | 20 +++++ src/deploy.sh | 110 +++++++++++++++++++++----- src/tail-ecs-events.py | 2 +- src/tail-task-logs.py | 65 ++++++++------- 8 files changed, 173 insertions(+), 82 deletions(-) create mode 100644 docker-compose.yml rename src/{cutover.sh => deploy-cutover.sh} (57%) create mode 100644 src/deploy-stop.sh diff --git a/Dockerfile b/Dockerfile index d6aac2e..55255ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM dnxsolutions/aws:1.16.263-dnx2 +FROM dnxsolutions/aws:1.17.14-dnx3 RUN apk --no-cache update && \ apk --no-cache add python3 && \ @@ -8,7 +8,7 @@ RUN pip3 install --no-cache --upgrade boto3 ADD src . 
-RUN chmod +x deploy.sh task-deploy.sh run-task.sh cutover.sh tail-task-logs.py +RUN chmod +x *.sh *.py ENTRYPOINT [ "/bin/bash", "-c" ] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..de932f1 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,18 @@ +version: '3.4' + +services: + app: + build: . + volumes: + - .:/work + environment: + - AWS_ACCESS_KEY_ID + - AWS_ACCOUNT_ID + - AWS_DEFAULT_REGION + - AWS_ROLE + - AWS_SECRET_ACCESS_KEY + - AWS_SECURITY_TOKEN + - AWS_SESSION_EXPIRATION + - AWS_SESSION_TOKEN + entrypoint: "" + command: /bin/bash diff --git a/src/app-spec.tpl.json b/src/app-spec.tpl.json index 8b31b52..c676c77 100755 --- a/src/app-spec.tpl.json +++ b/src/app-spec.tpl.json @@ -1,17 +1,6 @@ { - "version": 1, - "Resources": [ - { - "TargetService": { - "Type": "AWS::ECS::Service", - "Properties": { - "TaskDefinition": "$TASK_ARN", - "LoadBalancerInfo": { - "ContainerName": "$APP_NAME", - "ContainerPort": $CONTAINER_PORT - } - } - } - } - ] -} + "revisionType": "AppSpecContent", + "appSpecContent": { + "content": "{\"version\":1,\"Resources\":[{\"TargetService\":{\"Type\":\"AWS::ECS::Service\",\"Properties\":{\"TaskDefinition\":\"$TASK_ARN\",\"LoadBalancerInfo\":{\"ContainerName\":\"$APP_NAME\",\"ContainerPort\":$CONTAINER_PORT}}}}]}" + } +} \ No newline at end of file diff --git a/src/cutover.sh b/src/deploy-cutover.sh similarity index 57% rename from src/cutover.sh rename to src/deploy-cutover.sh index 7b7fd91..f61cf38 100755 --- a/src/cutover.sh +++ b/src/deploy-cutover.sh @@ -1,31 +1,20 @@ #!/bin/bash -e -ERROR=0 if [[ -z "$AWS_DEFAULT_REGION" ]]; then echo "---> ERROR: Missing variable AWS_DEFAULT_REGION"; ERROR=1; fi if [[ -z "$APP_NAME" ]]; then echo "---> ERROR: Missing variable APP_NAME"; ERROR=1; fi if [[ -z "$CLUSTER_NAME" ]]; then echo "---> ERROR: Missing variable CLUSTER_NAME"; ERROR=1; fi -if [[ -z "$CONTAINER_PORT" ]]; then echo "---> ERROR: Missing variable CONTAINER_PORT"; ERROR=1; fi -if [[ -z 
"$IMAGE_NAME" ]]; then echo "---> ERROR: Missing variable IMAGE_NAME"; ERROR=1; fi -if [[ "$ERROR" == "1" ]]; then exit 1; fi # Fetch deployment ID pending cutover to the green(new) enviroment DEPLOYMENT_ID=$(aws deploy list-deployments --application-name=$CLUSTER_NAME-$APP_NAME --deployment-group=$CLUSTER_NAME-$APP_NAME --max-items=1 --query="deployments[0]" --output=text | head -n 1) -DEPLOYMENT_PID=$! - -#echo "---> For More Deployment info: https://$AWS_DEFAULT_REGION.console.aws.amazon.com/codesuite/codedeploy/deployments/$DEPLOYMENT_ID" - -#echo "---> Waiting for Deployment ..." - aws deploy continue-deployment --deployment-id $DEPLOYMENT_ID --deployment-wait-type "READY_WAIT" -wait $DEPLOYMENT_PID RET=$? if [ $RET -eq 0 ]; then - echo "---> Deployment completed!" + echo "---> Cutover engaged!" else - echo "---> ERROR: Deployment FAILED!" + echo "---> ERROR: Cutover FAILED!" fi exit $RET diff --git a/src/deploy-stop.sh b/src/deploy-stop.sh new file mode 100644 index 0000000..90c5854 --- /dev/null +++ b/src/deploy-stop.sh @@ -0,0 +1,20 @@ +#!/bin/bash -e + +if [[ -z "$AWS_DEFAULT_REGION" ]]; then echo "---> ERROR: Missing variable AWS_DEFAULT_REGION"; ERROR=1; fi +if [[ -z "$APP_NAME" ]]; then echo "---> ERROR: Missing variable APP_NAME"; ERROR=1; fi +if [[ -z "$CLUSTER_NAME" ]]; then echo "---> ERROR: Missing variable CLUSTER_NAME"; ERROR=1; fi + +# Fetch deployment ID pending cutover to the green(new) enviroment +DEPLOYMENT_ID=$(aws deploy list-deployments --application-name=$CLUSTER_NAME-$APP_NAME --deployment-group=$CLUSTER_NAME-$APP_NAME --max-items=1 --query="deployments[0]" --output=text | head -n 1) + +aws deploy stop-deployment --deployment-id $DEPLOYMENT_ID + +RET=$? + +if [ $RET -eq 0 ]; then + echo "---> Deployment stopped!" +else + echo "---> ERROR: Deployment stopped FAILED!" 
+fi + +exit $RET diff --git a/src/deploy.sh b/src/deploy.sh index fc7d57e..cf7707f 100755 --- a/src/deploy.sh +++ b/src/deploy.sh @@ -17,47 +17,115 @@ envsubst < task-definition.tpl.json > task-definition.json echo "---> Task Definition" cat task-definition.json -export TASK_ARN=TASK_ARN_PLACEHOLDER +export TASK_ARN=$(aws ecs register-task-definition --cli-input-json file://./task-definition.json | jq --raw-output '.taskDefinition.taskDefinitionArn') envsubst < app-spec.tpl.json > app-spec.json +echo echo "---> App-spec for CodeDeploy" cat app-spec.json +echo echo "---> Creating deployment with CodeDeploy" set +e # disable bash exit on error -# Update the ECS service to use the updated Task version -aws ecs deploy \ - --service $APP_NAME \ - --task-definition ./task-definition.json \ - --cluster $CLUSTER_NAME \ - --codedeploy-appspec ./app-spec.json \ - --codedeploy-application $CLUSTER_NAME-$APP_NAME \ - --codedeploy-deployment-group $CLUSTER_NAME-$APP_NAME & +# # Update the ECS service to use the updated Task version +DEPLOYMENT_ID=$(aws deploy create-deployment \ + --application-name $CLUSTER_NAME-$APP_NAME \ + --deployment-config-name CodeDeployDefault.ECSAllAtOnce \ + --deployment-group-name $CLUSTER_NAME-$APP_NAME \ + --description Deployment \ + --revision file://app-spec.json \ + --query="deploymentId" --output text) + +# In case there is already a deployment in progress, script will fail +if [ $? -eq 255 ]; then + echo + echo + echo "===> Deployment already in progress. Please approve current deployment before performing a new deployment" + echo + echo + exit 1 +fi + +sleep 5 # Wait for deployment to be created -DEPLOYMENT_PID=$! +echo "---> For more info: https://$AWS_DEFAULT_REGION.console.aws.amazon.com/codesuite/codedeploy/deployments/$DEPLOYMENT_ID" -sleep 5 # Wait for deployment to be created so we can fetch DEPLOYMENT_ID next +/work/tail-ecs-events.py & +TAIL_ECS_EVENTS_PID=$! 
-DEPLOYMENT_ID=$(aws deploy list-deployments --application-name=$CLUSTER_NAME-$APP_NAME --deployment-group=$CLUSTER_NAME-$APP_NAME --max-items=1 --query="deployments[0]" --output=text | head -n 1) +RET=0 -echo "---> For More Deployment info: https://$AWS_DEFAULT_REGION.console.aws.amazon.com/codesuite/codedeploy/deployments/$DEPLOYMENT_ID" +while [ "$(aws deploy get-deployment --deployment-id $DEPLOYMENT_ID --query deploymentInfo.status --output text)" == "Created" ] +do + sleep 1 +done -echo "---> Waiting for Deployment ..." +echo "---> Deployment created!" -/work/tail-ecs-events.py & -TAIL_PID=$! +while [ "$(aws deploy get-deployment --deployment-id $DEPLOYMENT_ID --query deploymentInfo.status --output text)" == "InProgress" ] +do + sleep 1 +done -wait $DEPLOYMENT_PID -RET=$? +TASK_SET_ID=$(aws ecs describe-services --cluster $CLUSTER_NAME --service $APP_NAME --query "services[0].taskSets[?status == 'ACTIVE'].id" --output text) +if [ "${TASK_SET_ID}" != "" ]; then + echo "---> Task Set ID: $TASK_SET_ID" +fi + +# Due to a known issue in CodeDeploy, the deployment fails if the ECS service is unhealthy/unstable for 5 minutes for the replacement +# task set during the wait status; this 5-minute window is not configurable as of today. 
+# For that reason we wait up to MAX_WAIT + MAX_WAIT_BUFFER seconds (currently 360) before considering a deployment still in Ready status successful + +WAIT_PERIOD=0 +MAX_WAIT=300 #$(aws ecs describe-services --cluster $CLUSTER_NAME --service $APP_NAME --query services[0].healthCheckGracePeriodSeconds --output text) +MAX_WAIT_BUFFER=60 + +echo +echo +echo "---> Waiting $((MAX_WAIT + MAX_WAIT_BUFFER)) seconds for tasks to stabilise" +echo + +while [ "$(aws deploy get-deployment --deployment-id $DEPLOYMENT_ID --query deploymentInfo.status --output text)" == "Ready" ] +do + if [ "$WAIT_PERIOD" -ge "$((MAX_WAIT + MAX_WAIT_BUFFER))" ]; then + break + fi + sleep 10 + WAIT_PERIOD=$((WAIT_PERIOD + 10)) +done + +DEPLOYMENT_STATUS=$(aws deploy get-deployment --deployment-id $DEPLOYMENT_ID --query deploymentInfo.status --output text) +echo +echo "---> Deployment status: $DEPLOYMENT_STATUS" +echo + +if [ "$DEPLOYMENT_STATUS" == "Failed" ] +then + TASK_ARN=$(aws ecs list-tasks --cluster dev --desired-status STOPPED --started-by $TASK_SET_ID --query taskArns[0] --output text) + if [ "${TASK_ARN}" != "None" ]; then + echo "---> Displaying logs of STOPPED task: $TASK_ARN" + echo + /work/tail-task-logs.py $TASK_ARN + fi + RET=1 +elif [ "$DEPLOYMENT_STATUS" == "Stopped" ] +then + RET=1 +elif [ "$DEPLOYMENT_STATUS" == "Succeeded" ] +then + RET=0 +fi if [ $RET -eq 0 ]; then - echo "---> Deployment completed!" + echo + echo "---> Completed!" else + echo echo "---> ERROR: Deployment FAILED!" 
fi -kill $TAIL_PID +kill $TAIL_ECS_EVENTS_PID -exit $RET +exit $RET \ No newline at end of file diff --git a/src/tail-ecs-events.py b/src/tail-ecs-events.py index 07ce048..4b45a83 100755 --- a/src/tail-ecs-events.py +++ b/src/tail-ecs-events.py @@ -28,7 +28,7 @@ events_collected.insert(0, event) for event_collected in events_collected: - print('%s\t%s' % ('{0:%Y-%m-%d %H:%M:%S %z}'.format(event_collected['createdAt']), event_collected['message'])) + print('%s %s' % ('{0:%Y-%m-%d %H:%M:%S %z}'.format(event_collected['createdAt']), event_collected['message'])) last_event = events[0]['id'] time.sleep(5) diff --git a/src/tail-task-logs.py b/src/tail-task-logs.py index 62f3e3c..d89b427 100755 --- a/src/tail-task-logs.py +++ b/src/tail-task-logs.py @@ -1,49 +1,56 @@ #!/usr/bin/env python3 -import boto3, json, time, os, datetime +import boto3, json, time, os, datetime, sys aws_ecs = boto3.client('ecs') +logs = boto3.client('logs') cluster_name=os.environ['CLUSTER_NAME'] app_name=os.environ['APP_NAME'] -task_arn=os.environ['TASK_ID'] -task_number=task_arn.split(":task/",1)[1] #get the task number id +task_arn=sys.argv[1] + +task_id=task_arn.split(":task/",1)[1] #get the task number id last_event = None +log_group_name='/ecs/'+cluster_name+'/'+app_name + +extra_args = { + 'logGroupName': log_group_name, + 'logStreamName': app_name+'/'+app_name+'/'+task_id, + 'startFromHead': True +} while True: try: response = aws_ecs.describe_tasks( cluster=cluster_name, tasks=[task_arn]) - - logs = boto3.client('logs') task_status = response['tasks'][0]['lastStatus'] - print('Task status', task_status) - logGroupName='/ecs/'+cluster_name+'/'+app_name - print('Searching logs for ', logGroupName) - time.sleep(5) - - logStreams = logs.describe_log_streams( - logGroupName=logGroupName, - logStreamNamePrefix=app_name+'/'+app_name+'/'+task_number, - limit=1, - descending=True) - - for stream in logStreams['logStreams']: - streamName=stream['logStreamName'] - print('log Streams', streamName) - 
logStreamEvents = logs.get_log_events( - logGroupName=logGroupName, - logStreamName=streamName, - startFromHead=True) - for log in logStreamEvents['events']: - print(log['message']) - if task_status == 'STOPPED': - break - time.sleep(5) + + log_stream_events = logs.get_log_events(**extra_args) + + for event in log_stream_events['events']: + print("%s" % (event['message'])) + + if 'nextToken' in extra_args and log_stream_events['nextForwardToken'] == extra_args['nextToken']: + if task_status == "STOPPED": + print("======== TASK STOPPED ========") + print("Task ID: %s" % task_id) + print("Task ARN: %s" % task_arn) + print("Service Name: %s" % app_name) + print("Cluster Name: %s" % cluster_name) + print("Started at: %s" % response['tasks'][0]['startedAt']) + print("Stopped at: %s" % response['tasks'][0]['stoppedAt']) + print("Stopped Reason: %s" % response['tasks'][0]['stoppedReason']) + if 'stopCode' in response['tasks'][0]: + print("Stop Code: %s" % response['tasks'][0]['stopCode']) + print("") + break + time.sleep(1) + else: + extra_args['nextToken'] = log_stream_events['nextForwardToken'] except Exception as e: - print("error: " + str(e)) + print("Error: " + str(e)) break