Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .github/bin/redshift/delete-aws-redshift.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env bash
#
# Deletes the ephemeral Amazon Redshift CI cluster whose identifier was
# recorded by setup-aws-redshift.sh in the .cluster-identifier file next to
# this script, then removes that file.
#
# Exits 0 (success) when the identifier file is missing, so that the CI
# cleanup step is a no-op when no cluster was ever provisioned.

set -euo pipefail

# Directory containing this script (and the .cluster-identifier state file).
REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}"

if [[ ! -f "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier" ]]; then
    echo "Missing file ${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
    exit 0
fi

REDSHIFT_CLUSTER_IDENTIFIER=$(cat "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier")

echo "Deleting Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER}"
aws redshift delete-cluster --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}" --skip-final-cluster-snapshot

echo "Waiting for the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} to be deleted"
aws redshift wait cluster-deleted \
    --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}"
echo "Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} has been deleted"

# Remove the state file so a later cleanup run is a no-op.
rm -f -- "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
exit 0
40 changes: 40 additions & 0 deletions .github/bin/redshift/run-mvn-tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env bash
#
# Runs Redshift connector tests against an already-provisioned cluster.
#
# Run setup-aws-redshift.sh as a prerequisite for creating the Redshift
# instance and exporting the required environment variables
# (REDSHIFT_USER, REDSHIFT_PASSWORD, REDSHIFT_ENDPOINT, REDSHIFT_PORT, ...).
#
# Usage:
#   run-mvn-tests.sh '-pl :trino-redshift'
#
# TODO Warning: this is just a temporary version of the script, to be replaced.
# TODO It has not been tidied up and doesn't yet correspond to how we write scripts.

set -xeuo pipefail

REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}"
PROJECT_ROOT="${REDSHIFT_SCRIPTS_DIR}/../.."

cd "${PROJECT_ROOT}" || exit 1

suite_exit_code=0

# ${MAVEN} and ${MAVEN_TEST} are intentionally unquoted: they may carry
# multiple whitespace-separated flags that must undergo word-splitting.
# NOTE: the space before the continuation backslash is required; without it,
# "test" would be glued directly onto the expansion of ${MAVEN_TEST}.
${MAVEN} ${MAVEN_TEST} \
    test \
    -B -Dair.check.skip-all=true -Dmaven.javadoc.skip=true --fail-at-end \
    -Dtest.redshift.jdbc.user="${REDSHIFT_USER}" \
    -Dtest.redshift.jdbc.password="${REDSHIFT_PASSWORD}" \
    -Dtest.redshift.jdbc.endpoint="${REDSHIFT_ENDPOINT}:${REDSHIFT_PORT}/" \
    -Dtest.redshift.s3.tpch.tables.root="${REDSHIFT_S3_TPCH_TABLES_ROOT}" \
    -Dtest.redshift.iam.role="${REDSHIFT_IAM_ROLES}" \
    -Dtest.redshift.aws.region="${AWS_REGION}" \
    -Dtest.redshift.aws.access-key="${AWS_ACCESS_KEY_ID}" \
    -Dtest.redshift.aws.secret-key="${AWS_SECRET_ACCESS_KEY}" \
    "$@" ||
    suite_exit_code=1

echo "$0: exiting with ${suite_exit_code}"
exit "${suite_exit_code}"
48 changes: 48 additions & 0 deletions .github/bin/redshift/setup-aws-redshift.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
#
# Provisions an ephemeral single-node Amazon Redshift cluster for CI tests.
#
# This script is meant to be *sourced* by the CI workflow: it exports
# REDSHIFT_ENDPOINT, REDSHIFT_PORT, REDSHIFT_CLUSTER_DATABASE_NAME,
# REDSHIFT_USER and REDSHIFT_PASSWORD for the test run, and records the
# cluster identifier in .cluster-identifier so delete-aws-redshift.sh can
# tear the cluster down afterwards.

set -euo pipefail

REDSHIFT_SCRIPTS_DIR="${BASH_SOURCE%/*}"

# Fail fast with a clear message when required environment is missing.
: "${AWS_REGION:?AWS_REGION must be set}"
: "${REDSHIFT_SUBNET_GROUP_NAME:?REDSHIFT_SUBNET_GROUP_NAME must be set}"
: "${REDSHIFT_VPC_SECURITY_GROUP_IDS:?REDSHIFT_VPC_SECURITY_GROUP_IDS must be set}"
: "${REDSHIFT_IAM_ROLES:?REDSHIFT_IAM_ROLES must be set}"

# Redshift requires passwords containing at least a digit, a lower case letter
# and an upper case letter. Having no warranty that openssl will output a
# string following the password policy, add explicitly the string 'Red1!'
# to the password.
REDSHIFT_PASSWORD="$(openssl rand -base64 16 | tr -dc 'a-zA-Z0-9')Red1!"

# Random suffix so concurrent CI runs never collide on the identifier.
REDSHIFT_CLUSTER_IDENTIFIER="trino-redshift-ci-cluster-$(openssl rand -hex 8)"

# Tag the cluster with a time-to-live so a reaper can delete clusters leaked
# by cancelled workflows. GNU date (-d) with a BSD/macOS (-v) fallback.
REDSHIFT_CLUSTER_TTL=$(date -u -d "+2 hours" +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -u -v "+2H" +"%Y-%m-%dT%H:%M:%SZ")

echo "Creating the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION}."
aws redshift create-cluster \
    --db-name testdb \
    --region "${AWS_REGION}" \
    --node-type dc2.large \
    --number-of-nodes 1 \
    --master-username admin \
    --master-user-password "${REDSHIFT_PASSWORD}" \
    --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}" \
    --cluster-subnet-group-name "${REDSHIFT_SUBNET_GROUP_NAME}" \
    --cluster-type single-node \
    --vpc-security-group-ids "${REDSHIFT_VPC_SECURITY_GROUP_IDS}" \
    --iam-roles "${REDSHIFT_IAM_ROLES}" \
    --automated-snapshot-retention-period 0 \
    --publicly-accessible \
    --tags Key=cloud,Value=aws Key=environment,Value=test Key=project,Value=trino-redshift Key=ttl,Value="${REDSHIFT_CLUSTER_TTL}"

# Record the identifier so delete-aws-redshift.sh knows what to tear down.
echo "${REDSHIFT_CLUSTER_IDENTIFIER}" > "${REDSHIFT_SCRIPTS_DIR}/.cluster-identifier"
echo "Waiting for the Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION} to be available."

# Wait for the cluster to become available
aws redshift wait cluster-available \
    --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}"

echo "The Amazon Redshift cluster ${REDSHIFT_CLUSTER_IDENTIFIER} on the region ${AWS_REGION} is available for queries."

REDSHIFT_CLUSTER_DESCRIPTION=$(aws redshift describe-clusters --cluster-identifier "${REDSHIFT_CLUSTER_IDENTIFIER}")

# Assignment is kept separate from 'export' so a jq failure is not masked
# and still aborts under 'set -e'.
REDSHIFT_ENDPOINT=$(jq -r '.Clusters[0].Endpoint.Address' <<<"${REDSHIFT_CLUSTER_DESCRIPTION}")
REDSHIFT_PORT=$(jq -r '.Clusters[0].Endpoint.Port' <<<"${REDSHIFT_CLUSTER_DESCRIPTION}")
REDSHIFT_CLUSTER_DATABASE_NAME=$(jq -r '.Clusters[0].DBName' <<<"${REDSHIFT_CLUSTER_DESCRIPTION}")
REDSHIFT_USER=$(jq -r '.Clusters[0].MasterUsername' <<<"${REDSHIFT_CLUSTER_DESCRIPTION}")
export REDSHIFT_ENDPOINT
export REDSHIFT_PORT
export REDSHIFT_CLUSTER_DATABASE_NAME
export REDSHIFT_USER
export REDSHIFT_PASSWORD
29 changes: 29 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,7 @@ jobs:
- { modules: plugin/trino-postgresql }
- { modules: plugin/trino-raptor-legacy }
- { modules: plugin/trino-redis }
- { modules: plugin/trino-redshift, profile: cloud-tests }
- { modules: plugin/trino-singlestore }
- { modules: plugin/trino-sqlserver }
- { modules: testing/trino-faulttolerant-tests, profile: default }
Expand Down Expand Up @@ -676,6 +677,34 @@ jobs:
-Dhive.hadoop2.azure-abfs-container="${ABFS_CONTAINER}" \
-Dhive.hadoop2.azure-abfs-account="${ABFS_ACCOUNT}" \
-Dhive.hadoop2.azure-abfs-access-key="${ABFS_ACCESS_KEY}"
- name: Cloud Redshift Tests
env:
AWS_REGION: ${{ vars.REDSHIFT_AWS_REGION }}
AWS_ACCESS_KEY_ID: ${{ secrets.REDSHIFT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.REDSHIFT_AWS_SECRET_ACCESS_KEY }}
REDSHIFT_SUBNET_GROUP_NAME: ${{ vars.REDSHIFT_SUBNET_GROUP_NAME }}
REDSHIFT_IAM_ROLES: ${{ vars.REDSHIFT_IAM_ROLES }}
REDSHIFT_VPC_SECURITY_GROUP_IDS: ${{ vars.REDSHIFT_VPC_SECURITY_GROUP_IDS }}
REDSHIFT_S3_TPCH_TABLES_ROOT: ${{ vars.REDSHIFT_S3_TPCH_TABLES_ROOT }}
if: >-
contains(matrix.modules, 'trino-redshift') && contains(matrix.profile, 'cloud-tests') &&
(env.AWS_ACCESS_KEY_ID != '' || env.AWS_SECRET_ACCESS_KEY != '' || env.REDSHIFT_SUBNET_GROUP_NAME != '' || env.REDSHIFT_IAM_ROLES != '')
run: |
source .github/bin/redshift/setup-aws-resdshift.sh

$MAVEN test ${MAVEN_TEST} -pl :trino-redshift ${{ format('-P {0}', matrix.profile) }} \
-Dtest.redshift.jdbc.user="${REDSHIFT_USER}" \
-Dtest.redshift.jdbc.password="${REDSHIFT_PASSWORD}" \
-Dtest.redshift.jdbc.endpoint="${REDSHIFT_ENDPOINT}:${REDSHIFT_PORT}/" \
-Dtest.redshift.s3.tpch.tables.root="${REDSHIFT_S3_TPCH_TABLES_ROOT}" \
-Dtest.redshift.iam.role="${REDSHIFT_IAM_ROLES}" \
-Dtest.redshift.aws.region="${AWS_REGION}" \
-Dtest.redshift.aws.access-key="${AWS_ACCESS_KEY_ID}" \
-Dtest.redshift.aws.secret-key="${AWS_SECRET_ACCESS_KEY}"
- name: Cleanup ephemeral Redshift Cluster
# Cancelled workflows may have left the ephemeral cluster running
if: always()
run: .github/bin/redshift/delete-aws-redshift.sh
- name: Sanitize artifact name
if: always()
run: |
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ product-test-reports
/impacted-features.log
.github/test-matrix.yaml
.github/test-pt-matrix.yaml
/.github/bin/redshift/.cluster-identifier
110 changes: 109 additions & 1 deletion plugin/trino-redshift/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ is a single dc2.large instance. Additionally, you will need a S3 bucket
containing TPCH tiny data in Parquet format. The files should be named:

```
s3://<your_bucket>/tpch/tiny/<table_name>.parquet
s3://<your_bucket>/tpch/tiny/<table_name>/*.parquet
```

To run the tests set the following system properties:
Expand All @@ -18,3 +18,111 @@ test.redshift.jdbc.password=<password>
test.redshift.s3.tpch.tables.root=<your_bucket>
test.redshift.iam.role=<your_iam_arn_to_access_bucket>
```

## Redshift Cluster CI Infrastructure setup

### AWS VPC setup
On the _AWS VPC_ service create a VPC - `redshift-vpc`.
Key properties to configure on the VPC:

- `IPv4 CIDR`: `192.168.0.0/16`

Create for the `redshift-vpc` an Internet Gateway - `redshift-igw`.

Create a subnet for the VPC `redshift-public-subnet`.
In the route table of the subnet make sure to add the route
`Destination 0.0.0.0/0` to `Target` the previously created
internet gateway `redshift-igw`.

Create a Security Group `redshift-sg`.
Make the following adjustments in the security group to allow access to the
Redshift cluster from the general purpose Github CI runners:

- add an Inbound rule accepting `All traffic` from Source `0.0.0.0/0`
- add an Outbound rule for `All traffic` to destination `0.0.0.0/0`

### Amazon Redshift setup

Create a subnet group `cluster-subnet-group-trino-ci` associated with
the VPC `redshift-vpc` and the VPC subnet `redshift-public-subnet`.

### AWS IAM setup

Create the AWS IAM role `redshift-ci` and add to it
the `AmazonRedshiftAllCommandsFullAccess` policy.
This role will be passed to the ephemeral Redshift cluster to provide it with
the ability to execute `COPY` from AWS S3 bucket.

Ensure that the AWS IAM user used by the CI process does have the ability to
create ephemeral Amazon Redshift clusters:

```
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "PassRoleToRedshiftCluster",
"Effect": "Allow",
"Action": "iam:PassRole",
"Resource": "arn:aws:iam::894365193301:role/redshift-ci"
},
{
"Sid": "RedshiftClusterManagement",
"Effect": "Allow",
"Action": [
"redshift:DeleteTags",
"redshift:DeleteCluster",
"redshift:CreateTags",
"redshift:CreateCluster",
"redshift:DescribeClusters",
"redshift:DescribeLoggingStatus"
],
"Resource": "arn:aws:redshift:us-east-2:894365193301:cluster:trino-redshift-ci-cluster-*"
},
{
"Sid": "DescribeRedshiftVpcComponents",
"Effect": "Allow",
"Action": [
"ec2:DescribeInternetGateways",
"ec2:DescribeAddresses",
"ec2:DescribeAvailabilityZones",
"ec2:DescribeVpcs",
"ec2:DescribeAccountAttributes",
"ec2:DescribeSubnets",
"ec2:DescribeSecurityGroups"
],
"Resource": "*"
}
]
}
```

### AWS S3 setup

The `trino-redshift` tests rely on a Redshift cluster
having TPCH tables filled with data.
Create an AWS S3 bucket and add to it the parquet content
of `tpch` tables saved locally through the `trino-hive` connector
via commands like:

```
CREATE TABLE hive.tiny.table_name AS TABLE tpch.sf1.table_name
```

The content of the S3 bucket should look like this:

```
s3://<your_bucket>/tpch/tiny/<table_name>/*.parquet
```

where `table_name` is:

- `customer`
- `lineitem`
- `nation`
- `orders`
- `part`
- `partsupp`
- `region`
- `supplier`

23 changes: 23 additions & 0 deletions plugin/trino-redshift/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -195,5 +195,28 @@
</plugins>
</build>
</profile>

<profile>
<id>cloud-tests</id>
<activation>
<activeByDefault>false</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<includes>
<include>**/TestRedshiftAutomaticJoinPushdown.java</include>
<include>**/TestRedshiftConnectorTest.java</include>
<inclue>**/TestRedshiftTableStatisticsReader.java</inclue>
<include>**/TestRedshiftTypeMapping.java</include>
</includes>
</configuration>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ private static synchronized void provisionTables(Session session, QueryRunner qu

private static void copyFromS3(QueryRunner queryRunner, Session session, String name)
{
String s3Path = format("%s/%s/%s.parquet", S3_TPCH_TABLES_ROOT, TPCH_CATALOG, name);
String s3Path = format("%s/%s/%s/%s/", S3_TPCH_TABLES_ROOT, TPCH_CATALOG, TINY_SCHEMA_NAME, name);
log.info("Creating table %s in Redshift copying from %s", name, s3Path);

// Create table in ephemeral Redshift cluster with no data
Expand Down