Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .github/bin/redshift/delete-aws-redshift.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env bash
#
# Deletes the ephemeral Amazon Redshift CI cluster whose identifier was
# recorded by setup-aws-redshift.sh in the .cluster-identifier file next to
# this script, then removes that file.
#
# Exits 0 (success) when the identifier file is missing, so that the CI
# cleanup step is a no-op when no cluster was ever provisioned.

set -euo pipefail

# Directory containing this script (and the .cluster-identifier state file).
REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}"

if [[ ! -f "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier" ]]; then
    echo "Missing file ${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
    exit 0
fi

REDSHIFT_CLUSTER_IDENTIFIER=$(cat "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier")

echo "Deleting Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER}"
aws redshift delete-cluster --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}" --skip-final-cluster-snapshot

echo "Waiting for the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} to be deleted"
aws redshift wait cluster-deleted \
    --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}"
echo "Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} has been deleted"

# Remove the state file so a later cleanup run is a no-op.
rm -f -- "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
exit 0
40 changes: 40 additions & 0 deletions .github/bin/redshift/run-mvn-tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env bash
#
# Runs Redshift connector tests against an already-provisioned cluster.
#
# Run setup-aws-redshift.sh as a prerequisite for creating the Redshift
# instance and exporting the required environment variables
# (REDSHIFT_USER, REDSHIFT_PASSWORD, REDSHIFT_ENDPOINT, REDSHIFT_PORT, ...).
#
# Usage:
#   run-mvn-tests.sh '-pl :trino-redshift'
#
# TODO Warning: this is just a temporary version of the script, to be replaced.
# TODO It has not been tidied up and doesn't yet correspond to how we write scripts.

set -xeuo pipefail

REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}"
PROJECT_ROOT="${REDSHIFT_SCRIPTS_DIR}/../.."

cd "${PROJECT_ROOT}" || exit 1

suite_exit_code=0

# ${MAVEN} and ${MAVEN_TEST} are intentionally unquoted: they may carry
# multiple whitespace-separated flags that must undergo word-splitting.
# NOTE: the space before the continuation backslash is required; without it,
# "test" would be glued directly onto the expansion of ${MAVEN_TEST}.
${MAVEN} ${MAVEN_TEST} \
    test \
    -B -Dair.check.skip-all=true -Dmaven.javadoc.skip=true --fail-at-end \
    -Dtest.redshift.jdbc.user="${REDSHIFT_USER}" \
    -Dtest.redshift.jdbc.password="${REDSHIFT_PASSWORD}" \
    -Dtest.redshift.jdbc.endpoint="${REDSHIFT_ENDPOINT}:${REDSHIFT_PORT}/" \
    -Dtest.redshift.s3.tpch.tables.root="${REDSHIFT_S3_TPCH_TABLES_ROOT}" \
    -Dtest.redshift.iam.role="${REDSHIFT_IAM_ROLES}" \
    -Dtest.redshift.aws.region="${AWS_REGION}" \
    -Dtest.redshift.aws.access-key="${AWS_ACCESS_KEY_ID}" \
    -Dtest.redshift.aws.secret-key="${AWS_SECRET_ACCESS_KEY}" \
    "$@" ||
    suite_exit_code=1

echo "$0: exiting with ${suite_exit_code}"
exit "${suite_exit_code}"
48 changes: 48 additions & 0 deletions .github/bin/redshift/setup-aws-redshift.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
#
# Provisions an ephemeral single-node Amazon Redshift cluster for CI tests.
#
# This script is meant to be *sourced* by the CI workflow: it exports
# REDSHIFT_ENDPOINT, REDSHIFT_PORT, REDSHIFT_CLUSTER_DATABASE_NAME,
# REDSHIFT_USER and REDSHIFT_PASSWORD for the test run, and records the
# cluster identifier in .cluster-identifier so delete-aws-redshift.sh can
# tear the cluster down afterwards.

set -euo pipefail

REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}"

# Fail fast with a clear message when required environment is missing.
: "${AWS_REGION:?AWS_REGION must be set}"
: "${REDSHIFT_SUBNET_GROUP_NAME:?REDSHIFT_SUBNET_GROUP_NAME must be set}"
: "${REDSHIFT_VPC_SECURITY_GROUP_IDS:?REDSHIFT_VPC_SECURITY_GROUP_IDS must be set}"
: "${REDSHIFT_IAM_ROLES:?REDSHIFT_IAM_ROLES must be set}"

# Redshift requires passwords containing at least a digit, a lower case letter
# and an upper case letter. Having no warranty that openssl will output a
# string following the password policy, add explicitly the string 'Red1!'
# to the password.
REDSHIFT_PASSWORD="$(openssl rand -base64 16 | tr -dc 'a-zA-Z0-9')Red1!"

# Random suffix so concurrent CI runs never collide on the identifier.
REDSHIFT_CLUSTER_IDENTIFIER="trino-redshift-ci-cluster-$(openssl rand -hex 8)"

# Tag the cluster with a time-to-live so a reaper can delete clusters leaked
# by cancelled workflows. GNU date (-d) with a BSD/macOS (-v) fallback.
REDSHIFT_CLUSTER_TTL=$(date -u -d "+2 hours" +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -v "+2H" +"%Y-%m-%dT%H:%M:%SZ")

echo "Creating the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION}."
aws redshift create-cluster \
    --db-name testdb \
    --region "${AWS_REGION}" \
    --node-type dc2.large \
    --number-of-nodes 1 \
    --master-username admin \
    --master-user-password "${REDSHIFT_PASSWORD}" \
    --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}" \
    --cluster-subnet-group-name "${REDSHIFT_SUBNET_GROUP_NAME}" \
    --cluster-type single-node \
    --vpc-security-group-ids "${REDSHIFT_VPC_SECURITY_GROUP_IDS}" \
    --iam-roles "${REDSHIFT_IAM_ROLES}" \
    --automated-snapshot-retention-period 0 \
    --publicly-accessible \
    --tags Key=cloud,Value=aws Key=environment,Value=test Key=project,Value=trino-redshift Key=ttl,Value="${REDSHIFT_CLUSTER_TTL}"

# Record the identifier so delete-aws-redshift.sh knows what to tear down.
echo "${REDSHIFT_CLUSTER_IDENTIFIER}" > "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
echo "Waiting for the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION} to be available."

# Wait for the cluster to become available
aws redshift wait cluster-available \
    --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}"

echo "The Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION} is available for queries."

REDSHIFT_CLUSTER_DESCRIPTION=$(aws redshift describe-clusters --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}")

# Assignment is kept separate from 'export' so a jq failure is not masked
# and still aborts under 'set -e'.
REDSHIFT_ENDPOINT=$(jq -r '.Clusters[0].Endpoint.Address' <<<"${REDSHIFT_CLUSTER_DESCRIPTION}")
REDSHIFT_PORT=$(jq -r '.Clusters[0].Endpoint.Port' <<<"${REDSHIFT_CLUSTER_DESCRIPTION}")
REDSHIFT_CLUSTER_DATABASE_NAME=$(jq -r '.Clusters[0].DBName' <<<"${REDSHIFT_CLUSTER_DESCRIPTION}")
REDSHIFT_USER=$(jq -r '.Clusters[0].MasterUsername' <<<"${REDSHIFT_CLUSTER_DESCRIPTION}")
export REDSHIFT_ENDPOINT
export REDSHIFT_PORT
export REDSHIFT_CLUSTER_DATABASE_NAME
export REDSHIFT_USER
export REDSHIFT_PASSWORD
29 changes: 29 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,7 @@ jobs:
- { modules: plugin/trino-postgresql }
- { modules: plugin/trino-raptor-legacy }
- { modules: plugin/trino-redis }
- { modules: plugin/trino-redshift, profile: cloud-tests }
- { modules: plugin/trino-singlestore }
- { modules: plugin/trino-sqlserver }
- { modules: testing/trino-faulttolerant-tests, profile: default }
Expand Down Expand Up @@ -676,6 +677,34 @@ jobs:
-Dhive.hadoop2.azure-abfs-container="${ABFS_CONTAINER}" \
-Dhive.hadoop2.azure-abfs-account="${ABFS_ACCOUNT}" \
-Dhive.hadoop2.azure-abfs-access-key="${ABFS_ACCESS_KEY}"
- name: Cloud Redshift Tests
env:
AWS_REGION: ${{ vars.REDSHIFT_AWS_REGION }}
AWS_ACCESS_KEY_ID: ${{ secrets.REDSHIFT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.REDSHIFT_AWS_SECRET_ACCESS_KEY }}
REDSHIFT_SUBNET_GROUP_NAME: ${{ vars.REDSHIFT_SUBNET_GROUP_NAME }}
REDSHIFT_IAM_ROLES: ${{ vars.REDSHIFT_IAM_ROLES }}
REDSHIFT_VPC_SECURITY_GROUP_IDS: ${{ vars.REDSHIFT_VPC_SECURITY_GROUP_IDS }}
REDSHIFT_S3_TPCH_TABLES_ROOT: ${{ vars.REDSHIFT_S3_TPCH_TABLES_ROOT }}
if: >-
contains(matrix.modules, 'trino-redshift') && contains(matrix.profile, 'cloud-tests') &&
(env.AWS_ACCESS_KEY_ID != '' || env.AWS_SECRET_ACCESS_KEY != '' || env.REDSHIFT_SUBNET_GROUP_NAME != '' || env.REDSHIFT_IAM_ROLES != '')
run: |
source .github/bin/redshift/setup-aws-resdshift.sh

$MAVEN test ${MAVEN_TEST} -pl :trino-redshift ${{ format('-P {0}', matrix.profile) }} \
-Dtest.redshift.jdbc.user="${REDSHIFT_USER}" \
-Dtest.redshift.jdbc.password="${REDSHIFT_PASSWORD}" \
-Dtest.redshift.jdbc.endpoint="${REDSHIFT_ENDPOINT}:${REDSHIFT_PORT}/" \
-Dtest.redshift.s3.tpch.tables.root="${REDSHIFT_S3_TPCH_TABLES_ROOT}" \
-Dtest.redshift.iam.role="${REDSHIFT_IAM_ROLES}" \
-Dtest.redshift.aws.region="${AWS_REGION}" \
-Dtest.redshift.aws.access-key="${AWS_ACCESS_KEY_ID}" \
-Dtest.redshift.aws.secret-key="${AWS_SECRET_ACCESS_KEY}"
- name: Cleanup ephemeral Redshift Cluster
# Cancelled workflows may have left the ephemeral cluster running
if: always()
run: .github/bin/redshift/delete-aws-redshift.sh
- name: Sanitize artifact name
if: always()
run: |
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ product-test-reports
/impacted-features.log
.github/test-matrix.yaml
.github/test-pt-matrix.yaml
/.github/bin/redshift/.cluster-identifier
110 changes: 109 additions & 1 deletion plugin/trino-redshift/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ is a single dc2.large instance. Additionally, you will need a S3 bucket
containing TPCH tiny data in Parquet format. The files should be named:

```
s3://<your_bucket>/tpch/tiny/<table_name>.parquet
s3://<your_bucket>/tpch/tiny/<table_name>/*.parquet
```

To run the tests set the following system properties:
Expand All @@ -18,3 +18,111 @@ test.redshift.jdbc.password=<password>
test.redshift.s3.tpch.tables.root=<your_bucket>
test.redshift.iam.role=<your_iam_arn_to_access_bucket>
```

## Redshift Cluster CI Infrastructure setup

### AWS VPC setup
On the _AWS VPC_ service create a VPC - `redshift-vpc`.
Key properties to configure on the VPC:

- `IPv4 CIDR`: `192.168.0.0/16`

Create for the `redshift-vpc` an Internet Gateway - `redshift-igw`.

Create a subnet for the VPC `redshift-public-subnet`.
In the route table of the subnet make sure to add the route
`Destination 0.0.0.0/0` to `Target` the previously created
internet gateway `redshift-igw`.

Create a Security Group `redshift-sg`.
Make the following adjustments in the security group to allow access to the
Redshift cluster from the general purpose Github CI runners:

- add an Inbound rule accepting `All traffic` from Source `0.0.0.0/0`
- add an Outbound rule for `All traffic` to destination `0.0.0.0/0`

### Amazon Redshift setup

Create a subnet group `cluster-subnet-group-trino-ci` associated with
the VPC `redshift-vpc` and the VPC subnet `redshift-public-subnet`.

### AWS IAM setup

Create the AWS IAM role `redshift-ci` and add to it
the `AmazonRedshiftAllCommandsFullAccess` policy.
This role will be passed to the ephemeral Redshift cluster to provide it with
the ability to execute `COPY` from AWS S3 bucket.

Ensure that the AWS IAM user used by the CI process does have the ability to
create ephemeral Amazon Redshift clusters:

```
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "PassRoleToRedshiftCluster",
"Effect": "Allow",
"Action": "iam:PassRole",
"Resource": "arn:aws:iam::894365193301:role/redshift-ci"
},
{
"Sid": "RedshiftClusterManagement",
"Effect": "Allow",
"Action": [
"redshift:DeleteTags",
"redshift:DeleteCluster",
"redshift:CreateTags",
"redshift:CreateCluster",
"redshift:DescribeClusters",
"redshift:DescribeLoggingStatus"
],
"Resource": "arn:aws:redshift:us-east-2:894365193301:cluster:trino-redshift-ci-cluster-*"
},
{
"Sid": "DescribeRedshiftVpcComponents",
"Effect": "Allow",
"Action": [
"ec2:DescribeInternetGateways",
"ec2:DescribeAddresses",
"ec2:DescribeAvailabilityZones",
"ec2:DescribeVpcs",
"ec2:DescribeAccountAttributes",
"ec2:DescribeSubnets",
"ec2:DescribeSecurityGroups"
],
"Resource": "*"
}
]
}
```

### AWS S3 setup

The `trino-redshift` tests rely on a Redshift cluster
having TPCH tables filled with data.
Create an AWS S3 bucket and add to it the parquet content
of `tpch` tables saved locally through the `trino-hive` connector
via commands like:

```
CREATE TABLE hive.tiny.table_name AS TABLE tpch.sf1.table_name
```

The content of the S3 bucket should look like this:

```
s3://<your_bucket>/tpch/tiny/<table_name>/*.parquet
```

where `table_name` is:

- `customer`
- `lineitem`
- `nation`
- `orders`
- `part`
- `partsupp`
- `region`
- `supplier`

23 changes: 23 additions & 0 deletions plugin/trino-redshift/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -195,5 +195,28 @@
</plugins>
</build>
</profile>

<profile>
<id>cloud-tests</id>
<activation>
<activeByDefault>false</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<includes>
<include>**/TestRedshiftAutomaticJoinPushdown.java</include>
<include>**/TestRedshiftConnectorTest.java</include>
<inclue>**/TestRedshiftTableStatisticsReader.java</inclue>
<include>**/TestRedshiftTypeMapping.java</include>
</includes>
</configuration>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ private static synchronized void provisionTables(Session session, QueryRunner qu

private static void copyFromS3(QueryRunner queryRunner, Session session, String name)
{
String s3Path = format("%s/%s/%s.parquet", S3_TPCH_TABLES_ROOT, TPCH_CATALOG, name);
String s3Path = format("%s/%s/%s/%s/", S3_TPCH_TABLES_ROOT, TPCH_CATALOG, TINY_SCHEMA_NAME, name);
log.info("Creating table %s in Redshift copying from %s", name, s3Path);

// Create table in ephemeral Redshift cluster with no data
Expand Down