From 2b3864da158c9f3fa223b82b63276a55e250ae4b Mon Sep 17 00:00:00 2001 From: Marius Grama Date: Mon, 23 Jan 2023 22:54:16 +0100 Subject: [PATCH] Add CI infrastructure for `trino-redshift` Use an ephemeral/throw-away Redshift cluster for running integration tests on `trino-redshift` connector. Once the tests are run, the testing Redshift cluster is being reclaimed. The Redshift cluster is publicly accessible in order to be accessible from the general purpose Github runners. --- .github/bin/redshift/delete-aws-redshift.sh | 23 ++++ .github/bin/redshift/run-mvn-tests.sh | 40 +++++++ .github/bin/redshift/setup-aws-redshift.sh | 48 ++++++++ .github/workflows/ci.yml | 29 +++++ .gitignore | 1 + plugin/trino-redshift/README.md | 110 +++++++++++++++++- plugin/trino-redshift/pom.xml | 23 ++++ .../plugin/redshift/RedshiftQueryRunner.java | 2 +- 8 files changed, 274 insertions(+), 2 deletions(-) create mode 100755 .github/bin/redshift/delete-aws-redshift.sh create mode 100755 .github/bin/redshift/run-mvn-tests.sh create mode 100755 .github/bin/redshift/setup-aws-redshift.sh diff --git a/.github/bin/redshift/delete-aws-redshift.sh b/.github/bin/redshift/delete-aws-redshift.sh new file mode 100755 index 000000000000..99323d2479cb --- /dev/null +++ b/.github/bin/redshift/delete-aws-redshift.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +set -euo pipefail + +REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}" + +if [[ ! 
-f "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier" ]]; then + echo "Missing file ${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier" + exit 0 +fi + +REDSHIFT_CLUSTER_IDENTIFIER=$(cat $REDSHIFT_SCRIPTS_DIR/.cluster-identifier) + +echo "Deleting Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER" +aws redshift delete-cluster --cluster-identifier $REDSHIFT_CLUSTER_IDENTIFIER --skip-final-cluster-snapshot + +echo "Waiting for the Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER to be deleted" +aws redshift wait cluster-deleted \ + --cluster-identifier $REDSHIFT_CLUSTER_IDENTIFIER +echo "Amazon Redshift cluster $REDSHIFT_CLUSTER_IDENTIFIER has been deleted" + +rm -f ${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier +exit 0 diff --git a/.github/bin/redshift/run-mvn-tests.sh b/.github/bin/redshift/run-mvn-tests.sh new file mode 100755 index 000000000000..bd67245e53c6 --- /dev/null +++ b/.github/bin/redshift/run-mvn-tests.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +# TODO +# TODO Warning: this is just a temporary version of the script, to be replaced. +# TODO It has not been tidied up and doesn't yet correspond to how we write scripts. +# TODO + +# Runs Redshift connector tests. +# +# Run setup-aws-redshift.sh as a prerequisite for creating the Redshift instance and viewing +# required environment variables. +# +# Usage: +# run-mvn-tests.sh '-pl :trino-redshift' + +set -xeuo pipefail + +REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}" +PROJECT_ROOT="${REDSHIFT_SCRIPTS_DIR}/../.." 
+ +cd "${PROJECT_ROOT}" || exit 1 + +suite_exit_code=0 + +${MAVEN} ${MAVEN_TEST}\ + test \ + -B -Dair.check.skip-all=true -Dmaven.javadoc.skip=true --fail-at-end \ + -Dtest.redshift.jdbc.user="${REDSHIFT_USER}" \ + -Dtest.redshift.jdbc.password="${REDSHIFT_PASSWORD}" \ + -Dtest.redshift.jdbc.endpoint="${REDSHIFT_ENDPOINT}:${REDSHIFT_PORT}/" \ + -Dtest.redshift.s3.tpch.tables.root="${REDSHIFT_S3_TPCH_TABLES_ROOT}" \ + -Dtest.redshift.iam.role="${REDSHIFT_IAM_ROLES}" \ + -Dtest.redshift.aws.region="${AWS_REGION}" \ + -Dtest.redshift.aws.access-key="${AWS_ACCESS_KEY_ID}" \ + -Dtest.redshift.aws.secret-key="${AWS_SECRET_ACCESS_KEY}" \ + "$@" || + suite_exit_code=1 + +echo "$0: exiting with ${suite_exit_code}" +exit "${suite_exit_code}" diff --git a/.github/bin/redshift/setup-aws-redshift.sh b/.github/bin/redshift/setup-aws-redshift.sh new file mode 100755 index 000000000000..fcc74f949c95 --- /dev/null +++ b/.github/bin/redshift/setup-aws-redshift.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +set -euo pipefail + +REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}" + +# Redshift requires passwords containing at least a digit, a lower case letter and a upper case letter +# Having no warranty that openssl will output a string following the password policy, add explicitly +# the string 'Red1!' to the password +REDSHIFT_PASSWORD="$(openssl rand -base64 16 | tr -dc 'a-zA-Z0-9')Red1!" + +REDSHIFT_CLUSTER_IDENTIFIER=trino-redshift-ci-cluster-$(openssl rand -hex 8) + +REDSHIFT_CLUSTER_TTL=$(date -u -d "+2 hours" +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -v "+2H" +"%Y-%m-%dT%H:%M:%SZ") + +echo "Creating the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION}." 
+aws redshift create-cluster \ + --db-name testdb \ + --region ${AWS_REGION} \ + --node-type dc2.large \ + --number-of-nodes 1 \ + --master-username admin \ + --master-user-password ${REDSHIFT_PASSWORD} \ + --cluster-identifier ${REDSHIFT_CLUSTER_IDENTIFIER} \ + --cluster-subnet-group-name ${REDSHIFT_SUBNET_GROUP_NAME} \ + --cluster-type single-node\ + --vpc-security-group-ids "${REDSHIFT_VPC_SECURITY_GROUP_IDS}" \ + --iam-roles ${REDSHIFT_IAM_ROLES} \ + --automated-snapshot-retention-period 0 \ + --publicly-accessible \ + --tags Key=cloud,Value=aws Key=environment,Value=test Key=project,Value=trino-redshift Key=ttl,Value=${REDSHIFT_CLUSTER_TTL} + +echo ${REDSHIFT_CLUSTER_IDENTIFIER} > ${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier +echo "Waiting for the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION} to be available." + +# Wait for the cluster to become available +aws redshift wait cluster-available \ + --cluster-identifier ${REDSHIFT_CLUSTER_IDENTIFIER} + +echo "The Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION} is available for queries." 
+ +REDSHIFT_CLUSTER_DESCRIPTION=$(aws redshift describe-clusters --cluster-identifier ${REDSHIFT_CLUSTER_IDENTIFIER}) + +export REDSHIFT_ENDPOINT=$(echo ${REDSHIFT_CLUSTER_DESCRIPTION} | jq -r '.Clusters[0].Endpoint.Address' ) +export REDSHIFT_PORT=$(echo ${REDSHIFT_CLUSTER_DESCRIPTION} | jq -r '.Clusters[0].Endpoint.Port' ) +export REDSHIFT_CLUSTER_DATABASE_NAME=$(echo ${REDSHIFT_CLUSTER_DESCRIPTION} | jq -r '.Clusters[0].DBName' ) +export REDSHIFT_USER=$(echo ${REDSHIFT_CLUSTER_DESCRIPTION} | jq -r '.Clusters[0].MasterUsername' ) +export REDSHIFT_PASSWORD diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7b636411960e..e03ea69d44f3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -536,6 +536,7 @@ jobs: - { modules: plugin/trino-postgresql } - { modules: plugin/trino-raptor-legacy } - { modules: plugin/trino-redis } + - { modules: plugin/trino-redshift, profile: cloud-tests } - { modules: plugin/trino-singlestore } - { modules: plugin/trino-sqlserver } - { modules: testing/trino-faulttolerant-tests, profile: default } @@ -676,6 +677,34 @@ jobs: -Dhive.hadoop2.azure-abfs-container="${ABFS_CONTAINER}" \ -Dhive.hadoop2.azure-abfs-account="${ABFS_ACCOUNT}" \ -Dhive.hadoop2.azure-abfs-access-key="${ABFS_ACCESS_KEY}" + - name: Cloud Redshift Tests + env: + AWS_REGION: ${{ vars.REDSHIFT_AWS_REGION }} + AWS_ACCESS_KEY_ID: ${{ secrets.REDSHIFT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.REDSHIFT_AWS_SECRET_ACCESS_KEY }} + REDSHIFT_SUBNET_GROUP_NAME: ${{ vars.REDSHIFT_SUBNET_GROUP_NAME }} + REDSHIFT_IAM_ROLES: ${{ vars.REDSHIFT_IAM_ROLES }} + REDSHIFT_VPC_SECURITY_GROUP_IDS: ${{ vars.REDSHIFT_VPC_SECURITY_GROUP_IDS }} + REDSHIFT_S3_TPCH_TABLES_ROOT: ${{ vars.REDSHIFT_S3_TPCH_TABLES_ROOT }} + if: >- + contains(matrix.modules, 'trino-redshift') && contains(matrix.profile, 'cloud-tests') && + (env.AWS_ACCESS_KEY_ID != '' || env.AWS_SECRET_ACCESS_KEY != '' || env.REDSHIFT_SUBNET_GROUP_NAME != '' || env.REDSHIFT_IAM_ROLES 
!= '') + run: | + source .github/bin/redshift/setup-aws-redshift.sh + + $MAVEN test ${MAVEN_TEST} -pl :trino-redshift ${{ format('-P {0}', matrix.profile) }} \ + -Dtest.redshift.jdbc.user="${REDSHIFT_USER}" \ + -Dtest.redshift.jdbc.password="${REDSHIFT_PASSWORD}" \ + -Dtest.redshift.jdbc.endpoint="${REDSHIFT_ENDPOINT}:${REDSHIFT_PORT}/" \ + -Dtest.redshift.s3.tpch.tables.root="${REDSHIFT_S3_TPCH_TABLES_ROOT}" \ + -Dtest.redshift.iam.role="${REDSHIFT_IAM_ROLES}" \ + -Dtest.redshift.aws.region="${AWS_REGION}" \ + -Dtest.redshift.aws.access-key="${AWS_ACCESS_KEY_ID}" \ + -Dtest.redshift.aws.secret-key="${AWS_SECRET_ACCESS_KEY}" + - name: Cleanup ephemeral Redshift Cluster + # Cancelled workflows may have left the ephemeral cluster running + if: always() + run: .github/bin/redshift/delete-aws-redshift.sh - name: Sanitize artifact name if: always() run: | diff --git a/.gitignore b/.gitignore index 77a854990a1d..180ffef1f896 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ product-test-reports /impacted-features.log .github/test-matrix.yaml .github/test-pt-matrix.yaml +.github/bin/redshift/.cluster-identifier diff --git a/plugin/trino-redshift/README.md b/plugin/trino-redshift/README.md index 16229b145da1..d4c4ed672462 100644 --- a/plugin/trino-redshift/README.md +++ b/plugin/trino-redshift/README.md @@ -6,7 +6,7 @@ is a single dc2.large instance. Additionally, you will need a S3 bucket containing TPCH tiny data in Parquet format. The files should be named: ``` -s3://<bucket_name>/tpch/tiny/<table_name>.parquet +s3://<bucket_name>/tpch/tiny/<table_name>/*.parquet ``` To run the tests set the following system properties: @@ -18,3 +18,111 @@ test.redshift.jdbc.password= test.redshift.s3.tpch.tables.root= test.redshift.iam.role= ``` + +## Redshift Cluster CI Infrastructure setup + +### AWS VPC setup +On _AWS VPC_ service we create a VPC - `redshift-vpc`. +Key properties to configure on the VPC: + +- `IPv4 CIDR`: `192.168.0.0/16` + +Create for the `redshift-vpc` an Internet Gateway - `redshift-igw`. 
+ +Create a subnet for the VPC `redshift-public-subnet`. +In the route table of the subnet make sure to add the route +`Destination 0.0.0.0/0` to `Target` the previously created +internet gateway `redshift-igw`. + +Create a Security Group `redshift-sg`. +Make the following adjustments in the security group to allow access to the +Redshift cluster from the general purpose Github CI runners: + +- add an Inbound rule accepting `All traffic` from Source `0.0.0.0/0` +- add an Outbound rule for `All traffic` to destination `0.0.0.0/0` + +### Amazon Redshift setup + +Create a subnet group `cluster-subnet-group-trino-ci` associated with +the VPC `redshift-vpc` and the VPC subnet `redshift-public-subnet`. + +### AWS IAM setup + +Create the AWS IAM role `redshift-ci` and add to it +the `AmazonRedshiftAllCommandsFullAccess` policy. +This role will be passed to the ephemeral Redshift cluster to provide it with +the ability to execute `COPY` from AWS S3 bucket. + +Ensure that the AWS IAM user used by the CI process does have the ability to +create ephemeral Amazon Redshift clusters: + +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "PassRoleToRedshiftCluster", + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "arn:aws:iam::894365193301:role/redshift-ci" + }, + { + "Sid": "RedshiftClusterManagement", + "Effect": "Allow", + "Action": [ + "redshift:DeleteTags", + "redshift:DeleteCluster", + "redshift:CreateTags", + "redshift:CreateCluster", + "redshift:DescribeClusters", + "redshift:DescribeLoggingStatus" + ], + "Resource": "arn:aws:redshift:us-east-2:894365193301:cluster:trino-redshift-ci-cluster-*" + }, + { + "Sid": "DescribeRedshiftVpcComponents", + "Effect": "Allow", + "Action": [ + "ec2:DescribeInternetGateways", + "ec2:DescribeAddresses", + "ec2:DescribeAvailabilityZones", + "ec2:DescribeVpcs", + "ec2:DescribeAccountAttributes", + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups" + ], + "Resource": "*" + } + ] +} +``` + +### AWS S3 setup + 
+The `trino-redshift` tests rely on a Redshift cluster +having TPCH tables filled with data. +Create an AWS S3 bucket and add to it the parquet content +of `tpch` tables saved locally through the `trino-hive` connector +via commands like: + +``` +CREATE TABLE hive.tiny.table_name AS TABLE tpch.sf1.table_name +``` + +The content of the S3 bucket should look like this: + +``` +s3:///tpch/tiny//*.parquet +``` + +where `table_name` is: + +- `customer` +- `lineitem` +- `nation` +- `orders` +- `part` +- `partsupp` +- `region` +- `supplier` + diff --git a/plugin/trino-redshift/pom.xml b/plugin/trino-redshift/pom.xml index 6c87fd1326cb..b1430fdb3e76 100644 --- a/plugin/trino-redshift/pom.xml +++ b/plugin/trino-redshift/pom.xml @@ -195,5 +195,28 @@ + + + cloud-tests + + false + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + **/TestRedshiftAutomaticJoinPushdown.java + **/TestRedshiftConnectorTest.java + **/TestRedshiftTableStatisticsReader.java + **/TestRedshiftTypeMapping.java + + + + + + diff --git a/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/RedshiftQueryRunner.java b/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/RedshiftQueryRunner.java index 8b627642dd8e..20cd99f63b3a 100644 --- a/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/RedshiftQueryRunner.java +++ b/plugin/trino-redshift/src/test/java/io/trino/plugin/redshift/RedshiftQueryRunner.java @@ -197,7 +197,7 @@ private static synchronized void provisionTables(Session session, QueryRunner qu private static void copyFromS3(QueryRunner queryRunner, Session session, String name) { - String s3Path = format("%s/%s/%s.parquet", S3_TPCH_TABLES_ROOT, TPCH_CATALOG, name); + String s3Path = format("%s/%s/%s/%s/", S3_TPCH_TABLES_ROOT, TPCH_CATALOG, TINY_SCHEMA_NAME, name); log.info("Creating table %s in Redshift copying from %s", name, s3Path); // Create table in ephemeral Redshift cluster with no data