Skip to content

Commit

Permalink
Blue green cutover and improvements in logging (#7)
Browse files Browse the repository at this point in the history
* Changing how script waits for deployment to end (wip)

* Wait for deployment to be created

* Update Deploy.sh

Due to a known issue in CodeDeploy, the deployment fails if the ECS service is unhealthy/unstable for 5 minutes for the replacement task set during the wait status; this 5-minute limit is currently not configurable.
For this reason, we wait 10 minutes before considering a deployment in Ready status successful.

* Create stop.sh

Adding script stop.sh, to be triggered if the customer doesn't approve the new version for deployment after accessing and testing the application through the test listener.

* Update Dockerfile

Adding script stop.sh

* Cleanup and working on status to fail early if user cancels or deployment fails, instead of waiting for 10 minutes

* Refactoring script to wait for deployment when cutover is enabled

Co-authored-by: elliot-dnx <[email protected]>
  • Loading branch information
adenot and elliot-dnx authored Apr 8, 2020
1 parent e1377da commit fee014f
Show file tree
Hide file tree
Showing 8 changed files with 173 additions and 82 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM dnxsolutions/aws:1.16.263-dnx2
FROM dnxsolutions/aws:1.17.14-dnx3

RUN apk --no-cache update && \
apk --no-cache add python3 && \
Expand All @@ -8,7 +8,7 @@ RUN pip3 install --no-cache --upgrade boto3

ADD src .

RUN chmod +x deploy.sh task-deploy.sh run-task.sh cutover.sh tail-task-logs.py
RUN chmod +x *.sh *.py

ENTRYPOINT [ "/bin/bash", "-c" ]

Expand Down
18 changes: 18 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Compose definition for running the image built from this repository with an
# interactive shell.
version: '3.4'

services:
  app:
    # Build the image from the Dockerfile in this directory.
    build: .
    volumes:
      # Mount the repository into the container at /work.
      - .:/work
    # Variables listed without a value are passed through from the host
    # environment, so AWS credentials are never written to this file.
    environment:
      - AWS_ACCESS_KEY_ID
      - AWS_ACCOUNT_ID
      - AWS_DEFAULT_REGION
      - AWS_ROLE
      - AWS_SECRET_ACCESS_KEY
      - AWS_SECURITY_TOKEN
      - AWS_SESSION_EXPIRATION
      - AWS_SESSION_TOKEN
    # Clear the image's ENTRYPOINT so that `command` is executed directly.
    entrypoint: ""
    command: /bin/bash
21 changes: 5 additions & 16 deletions src/app-spec.tpl.json
Original file line number Diff line number Diff line change
@@ -1,17 +1,6 @@
{
"version": 1,
"Resources": [
{
"TargetService": {
"Type": "AWS::ECS::Service",
"Properties": {
"TaskDefinition": "$TASK_ARN",
"LoadBalancerInfo": {
"ContainerName": "$APP_NAME",
"ContainerPort": $CONTAINER_PORT
}
}
}
}
]
}
"revisionType": "AppSpecContent",
"appSpecContent": {
"content": "{\"version\":1,\"Resources\":[{\"TargetService\":{\"Type\":\"AWS::ECS::Service\",\"Properties\":{\"TaskDefinition\":\"$TASK_ARN\",\"LoadBalancerInfo\":{\"ContainerName\":\"$APP_NAME\",\"ContainerPort\":$CONTAINER_PORT}}}}]}"
}
}
15 changes: 2 additions & 13 deletions src/cutover.sh → src/deploy-cutover.sh
Original file line number Diff line number Diff line change
@@ -1,31 +1,20 @@
#!/bin/bash -e

ERROR=0
if [[ -z "$AWS_DEFAULT_REGION" ]]; then echo "---> ERROR: Missing variable AWS_DEFAULT_REGION"; ERROR=1; fi
if [[ -z "$APP_NAME" ]]; then echo "---> ERROR: Missing variable APP_NAME"; ERROR=1; fi
if [[ -z "$CLUSTER_NAME" ]]; then echo "---> ERROR: Missing variable CLUSTER_NAME"; ERROR=1; fi
if [[ -z "$CONTAINER_PORT" ]]; then echo "---> ERROR: Missing variable CONTAINER_PORT"; ERROR=1; fi
if [[ -z "$IMAGE_NAME" ]]; then echo "---> ERROR: Missing variable IMAGE_NAME"; ERROR=1; fi
if [[ "$ERROR" == "1" ]]; then exit 1; fi

# Fetch the ID of the deployment pending cutover to the green (new) environment
DEPLOYMENT_ID=$(aws deploy list-deployments --application-name=$CLUSTER_NAME-$APP_NAME --deployment-group=$CLUSTER_NAME-$APP_NAME --max-items=1 --query="deployments[0]" --output=text | head -n 1)

DEPLOYMENT_PID=$!

#echo "---> For More Deployment info: https://$AWS_DEFAULT_REGION.console.aws.amazon.com/codesuite/codedeploy/deployments/$DEPLOYMENT_ID"

#echo "---> Waiting for Deployment ..."

aws deploy continue-deployment --deployment-id $DEPLOYMENT_ID --deployment-wait-type "READY_WAIT"

wait $DEPLOYMENT_PID
RET=$?

if [ $RET -eq 0 ]; then
echo "---> Deployment completed!"
echo "---> Cutover engaged!"
else
echo "---> ERROR: Deployment FAILED!"
echo "---> ERROR: Cutover FAILED!"
fi

exit $RET
20 changes: 20 additions & 0 deletions src/deploy-stop.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
#
# Stops a CodeDeploy deployment of an ECS service, e.g. when the new (green)
# version is rejected after being tested and must not be cut over.
#
# The deployment stopped is the first one returned by
# `aws deploy list-deployments` for the application / deployment group
# (presumably the most recent one, i.e. the one pending cutover -- confirm
# against the CodeDeploy ordering if this matters).
#
# Required environment variables:
#   AWS_DEFAULT_REGION  region used by the AWS CLI
#   APP_NAME            \ the CodeDeploy application and deployment group are
#   CLUSTER_NAME        / both named "$CLUSTER_NAME-$APP_NAME"
#
# Exit status: 0 if the stop request succeeded, non-zero otherwise.
set -e

# Report every missing variable before giving up, so the caller can fix them
# all in one go. ERROR is initialised explicitly so that a value inherited
# from the environment cannot influence the check.
ERROR=0
if [[ -z "$AWS_DEFAULT_REGION" ]]; then echo "---> ERROR: Missing variable AWS_DEFAULT_REGION"; ERROR=1; fi
if [[ -z "$APP_NAME" ]]; then echo "---> ERROR: Missing variable APP_NAME"; ERROR=1; fi
if [[ -z "$CLUSTER_NAME" ]]; then echo "---> ERROR: Missing variable CLUSTER_NAME"; ERROR=1; fi
if [[ "$ERROR" == "1" ]]; then exit 1; fi

CODEDEPLOY_NAME="$CLUSTER_NAME-$APP_NAME"

# Fetch deployment ID pending cutover to the green (new) environment.
# The output is captured without a pipe so that a failing AWS CLI call is
# detected instead of being masked by the exit status of `head`.
if ! DEPLOYMENTS=$(aws deploy list-deployments \
  --application-name "$CODEDEPLOY_NAME" \
  --deployment-group-name "$CODEDEPLOY_NAME" \
  --max-items 1 \
  --query "deployments[0]" \
  --output text); then
  echo "---> ERROR: Could not list deployments for $CODEDEPLOY_NAME"
  exit 1
fi

# Keep only the first line of the output (same effect as `head -n 1`).
DEPLOYMENT_ID=${DEPLOYMENTS%%$'\n'*}

# With --output text the CLI prints "None" when the query matches nothing.
if [[ -z "$DEPLOYMENT_ID" || "$DEPLOYMENT_ID" == "None" ]]; then
  echo "---> ERROR: No deployment found for $CODEDEPLOY_NAME"
  exit 1
fi

# `|| RET=$?` keeps `set -e` from aborting the script on failure, so the
# error message below is actually reachable.
RET=0
aws deploy stop-deployment --deployment-id "$DEPLOYMENT_ID" || RET=$?

if [[ "$RET" -eq 0 ]]; then
  echo "---> Deployment stopped!"
else
  echo "---> ERROR: Deployment stopped FAILED!"
fi

exit "$RET"
110 changes: 89 additions & 21 deletions src/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,47 +17,115 @@ envsubst < task-definition.tpl.json > task-definition.json
echo "---> Task Definition"
cat task-definition.json

export TASK_ARN=TASK_ARN_PLACEHOLDER
export TASK_ARN=$(aws ecs register-task-definition --cli-input-json file://./task-definition.json | jq --raw-output '.taskDefinition.taskDefinitionArn')

envsubst < app-spec.tpl.json > app-spec.json
echo
echo "---> App-spec for CodeDeploy"
cat app-spec.json

echo
echo "---> Creating deployment with CodeDeploy"

set +e # disable bash exit on error

# Update the ECS service to use the updated Task version
aws ecs deploy \
--service $APP_NAME \
--task-definition ./task-definition.json \
--cluster $CLUSTER_NAME \
--codedeploy-appspec ./app-spec.json \
--codedeploy-application $CLUSTER_NAME-$APP_NAME \
--codedeploy-deployment-group $CLUSTER_NAME-$APP_NAME &
# # Update the ECS service to use the updated Task version
DEPLOYMENT_ID=$(aws deploy create-deployment \
--application-name $CLUSTER_NAME-$APP_NAME \
--deployment-config-name CodeDeployDefault.ECSAllAtOnce \
--deployment-group-name $CLUSTER_NAME-$APP_NAME \
--description Deployment \
--revision file://app-spec.json \
--query="deploymentId" --output text)

# In case there is already a deployment in progress, script will fail
if [ $? -eq 255 ]; then
echo
echo
echo "===> Deployment already in progress. Please approve current deployment before performing a new deployment"
echo
echo
exit 1
fi

sleep 5 # Wait for deployment to be created

DEPLOYMENT_PID=$!
echo "---> For more info: https://$AWS_DEFAULT_REGION.console.aws.amazon.com/codesuite/codedeploy/deployments/$DEPLOYMENT_ID"

sleep 5 # Wait for deployment to be created so we can fetch DEPLOYMENT_ID next
/work/tail-ecs-events.py &
TAIL_ECS_EVENTS_PID=$!

DEPLOYMENT_ID=$(aws deploy list-deployments --application-name=$CLUSTER_NAME-$APP_NAME --deployment-group=$CLUSTER_NAME-$APP_NAME --max-items=1 --query="deployments[0]" --output=text | head -n 1)
RET=0

echo "---> For More Deployment info: https://$AWS_DEFAULT_REGION.console.aws.amazon.com/codesuite/codedeploy/deployments/$DEPLOYMENT_ID"
while [ "$(aws deploy get-deployment --deployment-id $DEPLOYMENT_ID --query deploymentInfo.status --output text)" == "Created" ]
do
sleep 1
done

echo "---> Waiting for Deployment ..."
echo "---> Deployment created!"

/work/tail-ecs-events.py &
TAIL_PID=$!
while [ "$(aws deploy get-deployment --deployment-id $DEPLOYMENT_ID --query deploymentInfo.status --output text)" == "InProgress" ]
do
sleep 1
done

wait $DEPLOYMENT_PID
RET=$?
TASK_SET_ID=$(aws ecs describe-services --cluster $CLUSTER_NAME --service $APP_NAME --query "services[0].taskSets[?status == 'ACTIVE'].id" --output text)
if [ "${TASK_SET_ID}" != "" ]; then
echo "---> Task Set ID: $TASK_SET_ID"
fi

# Due to a known issue in CodeDeploy, the deployment fails if the ECS service is unhealthy/unstable for 5 minutes for the replacement
# task set during the wait status; this 5-minute limit is currently not configurable.
# For this reason we wait slightly longer than that (MAX_WAIT + MAX_WAIT_BUFFER seconds) before considering a deployment still in Ready status successful.

WAIT_PERIOD=0
MAX_WAIT=300 #$(aws ecs describe-services --cluster $CLUSTER_NAME --service $APP_NAME --query services[0].healthCheckGracePeriodSeconds --output text)
MAX_WAIT_BUFFER=60

echo
echo
echo "---> Waiting $((MAX_WAIT + MAX_WAIT_BUFFER)) seconds for tasks to stabilise"
echo

while [ "$(aws deploy get-deployment --deployment-id $DEPLOYMENT_ID --query deploymentInfo.status --output text)" == "Ready" ]
do
if [ "$WAIT_PERIOD" -ge "$((MAX_WAIT + MAX_WAIT_BUFFER))" ]; then
break
fi
sleep 10
WAIT_PERIOD=$((WAIT_PERIOD + 10))
done

DEPLOYMENT_STATUS=$(aws deploy get-deployment --deployment-id $DEPLOYMENT_ID --query deploymentInfo.status --output text)
echo
echo "---> Deployment status: $DEPLOYMENT_STATUS"
echo

if [ "$DEPLOYMENT_STATUS" == "Failed" ]
then
TASK_ARN=$(aws ecs list-tasks --cluster dev --desired-status STOPPED --started-by $TASK_SET_ID --query taskArns[0] --output text)
if [ "${TASK_ARN}" != "None" ]; then
echo "---> Displaying logs of STOPPED task: $TASK_ARN"
echo
/work/tail-task-logs.py $TASK_ARN
fi
RET=1
elif [ "$DEPLOYMENT_STATUS" == "Stopped" ]
then
RET=1
elif [ "$DEPLOYMENT_STATUS" == "Succeeded" ]
then
RET=0
fi

if [ $RET -eq 0 ]; then
echo "---> Deployment completed!"
echo
echo "---> Completed!"
else
echo
echo "---> ERROR: Deployment FAILED!"
fi

kill $TAIL_PID
kill $TAIL_ECS_EVENTS_PID

exit $RET
exit $RET
2 changes: 1 addition & 1 deletion src/tail-ecs-events.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
events_collected.insert(0, event)

for event_collected in events_collected:
print('%s\t%s' % ('{0:%Y-%m-%d %H:%M:%S %z}'.format(event_collected['createdAt']), event_collected['message']))
print('%s %s' % ('{0:%Y-%m-%d %H:%M:%S %z}'.format(event_collected['createdAt']), event_collected['message']))

last_event = events[0]['id']
time.sleep(5)
Expand Down
65 changes: 36 additions & 29 deletions src/tail-task-logs.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,56 @@
#!/usr/bin/env python3

import boto3, json, time, os, datetime
import boto3, json, time, os, datetime, sys

aws_ecs = boto3.client('ecs')
logs = boto3.client('logs')

cluster_name=os.environ['CLUSTER_NAME']
app_name=os.environ['APP_NAME']
task_arn=os.environ['TASK_ID']
task_number=task_arn.split(":task/",1)[1] #get the task number id
task_arn=sys.argv[1]

task_id=task_arn.split(":task/",1)[1] #get the task number id
last_event = None
log_group_name='/ecs/'+cluster_name+'/'+app_name

extra_args = {
'logGroupName': log_group_name,
'logStreamName': app_name+'/'+app_name+'/'+task_id,
'startFromHead': True
}

while True:
try:
response = aws_ecs.describe_tasks(
cluster=cluster_name,
tasks=[task_arn])

logs = boto3.client('logs')
task_status = response['tasks'][0]['lastStatus']
print('Task status', task_status)
logGroupName='/ecs/'+cluster_name+'/'+app_name
print('Searching logs for ', logGroupName)
time.sleep(5)

logStreams = logs.describe_log_streams(
logGroupName=logGroupName,
logStreamNamePrefix=app_name+'/'+app_name+'/'+task_number,
limit=1,
descending=True)

for stream in logStreams['logStreams']:
streamName=stream['logStreamName']
print('log Streams', streamName)
logStreamEvents = logs.get_log_events(
logGroupName=logGroupName,
logStreamName=streamName,
startFromHead=True)
for log in logStreamEvents['events']:
print(log['message'])
if task_status == 'STOPPED':
break
time.sleep(5)

log_stream_events = logs.get_log_events(**extra_args)

for event in log_stream_events['events']:
print("%s" % (event['message']))

if 'nextToken' in extra_args and log_stream_events['nextForwardToken'] == extra_args['nextToken']:
if task_status == "STOPPED":
print("======== TASK STOPPED ========")
print("Task ID: %s" % task_id)
print("Task ARN: %s" % task_arn)
print("Service Name: %s" % app_name)
print("Cluster Name: %s" % cluster_name)
print("Started at: %s" % response['tasks'][0]['startedAt'])
print("Stopped at: %s" % response['tasks'][0]['stoppedAt'])
print("Stopped Reason: %s" % response['tasks'][0]['stoppedReason'])
if 'stopCode' in response['tasks'][0]:
print("Stop Code: %s" % response['tasks'][0]['stopCode'])
print("")
break
time.sleep(1)
else:
extra_args['nextToken'] = log_stream_events['nextForwardToken']

except Exception as e:
print("error: " + str(e))
print("Error: " + str(e))
break


0 comments on commit fee014f

Please sign in to comment.